summaryrefslogtreecommitdiff
blob: c2797507306a80f22b15b53068da2ba1a213ac53 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/perl
use strict;
use warnings;

# Input dump to parse. Each record is expected on its own line, in the form
#   File-<fileid>-<field>: <value>
my $filename = "sample.out";

# Three-argument open so the filename can never be interpreted as a mode or
# pipe; include $! so the failure reason is reported.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

# %documents maps each fileid to a hash of its fields:
#   $documents{$fileid}{$field} = $value
my %documents;
while (my $line = <$fh>) {
    # Strip the line terminator up front so the final line of the file is
    # parsed even when it has no trailing newline.
    chomp $line;

    # Only use the captures when the match succeeded. After a failed match
    # $1..$3 still hold the values of the previous successful match, which
    # would silently re-store the previous record.
    unless ($line =~ /File-([^-]+)-([^:]+): ([^\n]*)/) {
        warn "$filename:$.: skipping unparseable line\n";
        next;
    }
    my $fileid = $1; # numeric or "dist"
    my $field  = $2; # string, non-empty
    my $value  = $3; # string, may be empty
    #print "Fileid: ". $fileid . "\n";
    #print "field: ". $field . "\n";
    #print "Value: ". $value . "\n";

    # Autovivification creates the inner hash on first use, so no explicit
    # existence check is needed. A repeated field overwrites the old value.
    $documents{$fileid}{$field} = $value;
}
close($fh);


# Fields for indexing.

# our %fields = (
#     distfile => 'text',
#     filename => 'text',
#     isdist   => 'UnAnalyzedField',
#     size     => 'UnAnalyzedField',
#     mtime    => 'UnAnalyzedField',
#     md5      => 'UnAnalyzedField',
#     sha1     => 'UnAnalyzedField',
# );

# The analyzer should simply tokenize filenames into their parts.
# I would split on [/._-] at least (hyphen last, so it is literal). Technically, using
# (\W|_|\d) as the class of split characters might be reasonable