diff options
author | Robert Buchholz <rbu@gentoo.org> | 2008-03-20 20:55:44 +0000 |
---|---|---|
committer | Robert Buchholz <rbu@gentoo.org> | 2008-03-20 20:55:44 +0000 |
commit | e6cc6776a13fab29b4000268d259176080569fb7 (patch) | |
tree | ccd46a8f83c5b2f77192365a039f79390945b10c /read-index.pl | |
download | distindex-e6cc6776a13fab29b4000268d259176080569fb7.tar.gz distindex-e6cc6776a13fab29b4000268d259176080569fb7.tar.bz2 distindex-e6cc6776a13fab29b4000268d259176080569fb7.zip |
Initial version
git-svn-id: https://overlays.gentoo.org/svn/dev/rbu/distindex@27 130f8837-a733-0410-98ec-ba4ccdad31c5
Diffstat (limited to 'read-index.pl')
-rw-r--r-- | read-index.pl | 43 |
1 files changed, 43 insertions, 0 deletions
#!/usr/bin/perl
# read-index.pl (recovered from the "new file" diff a/read-index.pl ->
# b/read-index.pl, blob c279750, which had been collapsed onto one line).
#
# Reads an index dump and groups its records per file id.  Every usable
# input line contains a record of the form
#
#     File-<fileid>-<field>: <value>
#
# which is stored as $documents{<fileid>}{<field>} = <value>.  A later
# record for the same file id and field overwrites the earlier one.
use strict;
use warnings;

my $filename = "sample.out";

# Three-argument open: the file name can never be interpreted as an open
# mode, and $! tells the user why the open failed.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

my %documents;
while (my $line = <$fh>) {
    # Strip the line terminator up front so that a final line without a
    # trailing newline is parsed like every other line.
    chomp $line;

    # The pattern is intentionally not anchored at the start of the line,
    # so a record preceded by other text is still accepted.  In list
    # context a failed match returns the empty list, which leaves all
    # three variables undefined.
    my ($fileid, $field, $value) =
        $line =~ /File-([^-]+)-([^:]+): (.*)\z/;
    # $fileid: numeric or "dist" (per the original author's note)
    # $field:  string, non-empty
    # $value:  string, may be empty

    # Skip lines that do not contain a record rather than storing data
    # under an undefined key.
    unless (defined $fileid) {
        warn "$filename:$.: skipping unparsable line\n";
        next;
    }

    #print "Fileid: " . $fileid . "\n";
    #print "field: "  . $field  . "\n";
    #print "Value: "  . $value  . "\n";

    # Autovivification creates the per-file hash on first use, so no
    # explicit "first record for this file id" branch is needed.
    $documents{$fileid}{$field} = $value;
}
close($fh);


# Fields for indexing.

# our %fields = (
#     distfile => 'text',
#     filename => 'text',
#     isdist   => 'UnAnalyzedField',
#     size     => 'UnAnalyzedField',
#     mtime    => 'UnAnalyzedField',
#     md5      => 'UnAnalyzedField',
#     sha1     => 'UnAnalyzedField',
# );

# The analyzer should simply tokenize file names into their parts.
# Splitting on the characters '/', '.', '-' and '_' is the minimum;
# technically, using (\W|_|\d) as the class of split characters might
# also be reasonable.