#!/usr/bin/perl
use strict;
use warnings;

# Read "sample.out" and collect its records into %documents, a hash of
# hashes keyed first by file id and then by field name:
#
#     $documents{$fileid}{$field} = $value
#
# A record line is expected to contain
#
#     File-<fileid>-<field>: <value>
#
# Lines that do not contain such a record are skipped.

my $filename = "sample.out";

# Three-argument open: the filename is always treated as a plain path and
# can never be interpreted as a mode or a pipe.  $! reports why it failed.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

my %documents;

while (my $line = <$fh>) {
    # Strip the line terminator up front so that a final line without a
    # trailing newline is parsed like every other record.
    chomp $line;

    # Take the captures straight from the match and skip the line when it
    # does not match; reading $1..$3 after a failed match would yield
    # undefined or stale values and pollute %documents.
    my ($fileid, $field, $value) =
        $line =~ /File-([^-]+)-([^:]+): ([^\n]*)/
        or next;

    # $fileid: numeric or "dist"
    # $field:  string, non-empty
    # $value:  string, may be empty

    #print "Fileid: ". $fileid . "\n";
    #print "field: ". $field . "\n";
    #print "Value: ". $value . "\n";

    # Autovivification creates the inner hash the first time a file id is
    # seen; a later record for the same field overwrites the earlier one.
    $documents{$fileid}{$field} = $value;
}

close($fh);

# Fields for indexing.
# our %fields = (
#     distfile => 'text',
#     filename => 'text',
#     isdist   => 'UnAnalyzedField',
#     size     => 'UnAnalyzedField',
#     mtime    => 'UnAnalyzedField',
#     md5      => 'UnAnalyzedField',
#     sha1     => 'UnAnalyzedField',
# );

# The analyzer should simply tokenize filenames by their parts.
# I would split up by [/.-_] at least. Technically, using
# (\W|_|\d) as the class of split characters might be reasonable.