#!/usr/bin/perl
#
# read-index.pl -- read a listing of "File-<id>-<field>: <value>" lines and
# collect them into a hash of per-file records.
#
# Provenance: added as read-index.pl ("Initial version") in commit
# e6cc6776a13fab29b4000268d259176080569fb7 by Robert Buchholz,
# Thu, 20 Mar 2008 20:55:44 +0000.
# git-svn-id: https://overlays.gentoo.org/svn/dev/rbu/distindex@27
#             130f8837-a733-0410-98ec-ba4ccdad31c5
use strict;
use warnings;

my $filename = "sample.out";

# Three-argument open: the mode is fixed to read-only, so characters in the
# file name can never be interpreted as an open mode or a pipe.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

# %documents maps a file id to a hash reference of { field => value }.
my %documents;
while (my $line = <$fh>) {
    # Strip the line terminator up front so that a final line without a
    # trailing newline is parsed like every other line.
    chomp $line;

    # Captures:
    #   $fileid - numeric or "dist"
    #   $field  - string, non-empty
    #   $value  - string, may be empty
    # A failed match leaves $1..$3 holding the values of the previous
    # successful match, so take the captures from the match's return list
    # and skip lines that do not have the expected shape.
    my ($fileid, $field, $value) =
        $line =~ /File-([^-]+)-([^:]+): ([^\n]*)/
        or next;

    #print "Fileid: ". $fileid . "\n";
    #print "field: ". $field . "\n";
    #print "Value: ". $value . "\n";

    # Autovivification creates the inner hash on first use; a repeated
    # field for the same file id overwrites the earlier value.
    $documents{$fileid}{$field} = $value;
}
close($fh);


# Fields for indexing.

# our %fields = (
#     distfile => 'text',
#     filename => 'text',
#     isdist   => 'UnAnalyzedField',
#     size     => 'UnAnalyzedField',
#     mtime    => 'UnAnalyzedField',
#     md5      => 'UnAnalyzedField',
#     sha1     => 'UnAnalyzedField',
# );

# The analyzer should simply tokenize filenames into their parts.
# I would split on [/.-_] at least. Technically, using
# (\W|_|\d) as the class of split characters might be reasonable.