summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobert Buchholz <rbu@gentoo.org>2008-03-20 20:55:44 +0000
committerRobert Buchholz <rbu@gentoo.org>2008-03-20 20:55:44 +0000
commite6cc6776a13fab29b4000268d259176080569fb7 (patch)
treeccd46a8f83c5b2f77192365a039f79390945b10c /read-index.pl
downloaddistindex-e6cc6776a13fab29b4000268d259176080569fb7.tar.gz
distindex-e6cc6776a13fab29b4000268d259176080569fb7.tar.bz2
distindex-e6cc6776a13fab29b4000268d259176080569fb7.zip
Initial version
git-svn-id: https://overlays.gentoo.org/svn/dev/rbu/distindex@27 130f8837-a733-0410-98ec-ba4ccdad31c5
Diffstat (limited to 'read-index.pl')
-rw-r--r--read-index.pl43
1 files changed, 43 insertions, 0 deletions
diff --git a/read-index.pl b/read-index.pl
new file mode 100644
index 0000000..c279750
--- /dev/null
+++ b/read-index.pl
@@ -0,0 +1,43 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+my $filename = "sample.out";
+
+# Parse "File-<fileid>-<field>: <value>" lines into $documents{fileid}{field}.
+open(my $fh, '<', $filename) or die "could not open $filename: $!";
+
+my %documents;
+while (my $line = <$fh>) {
+    chomp $line;    # the final line may lack a trailing newline
+
+    # Capture in list context: a failed match then yields an empty list
+    # instead of silently reusing $1..$3 from a previously matched line.
+    my ($fileid, $field, $value) = $line =~ /File-([^-]+)-([^:]+): (.*)/;
+    # fileid: numeric or "dist"; field: non-empty; value: may be empty
+    if (!defined $fileid) {
+        warn "skipping unparsable line $. of $filename\n";
+        next;
+    }
+    $documents{$fileid}{$field} = $value;    # inner hash is autovivified
+}
+close($fh);
+
+
+# Fields for indexing.
+
+# our %fields = (
+# distfile => 'text',
+# filename => 'text',
+# isdist => 'UnAnalyzedField',
+# size => 'UnAnalyzedField',
+# mtime => 'UnAnalyzedField',
+# md5 => 'UnAnalyzedField',
+# sha1 => 'UnAnalyzedField',
+# );
+
+# The analyzer should simply tokenize filenames into their parts.
+# I would split on [/._-] at least ("-" last, so it is not read as a range).
+# Technically, (\W|_|\d) as the class of split characters might be reasonable.
+