blob: c2797507306a80f22b15b53068da2ba1a213ac53 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
|
#!/usr/bin/perl
use strict;
use warnings;
# Parse "sample.out" into %documents: a hash keyed by file id whose values
# are hash refs mapping field name => value.
#
# Each input line is expected to contain
#     File-<fileid>-<field>: <value>
# where <fileid> contains no '-', <field> contains no ':', and <value> is
# the remainder of the line (possibly empty).
my $filename = "sample.out";

# Three-argument open: the file name can never be interpreted as a mode or
# a pipe, and $! tells the user why the open failed.
open(my $fh, '<', $filename) or die "could not open $filename: $!";

my %documents;
while (my $line = <$fh>) {
    # Strip the line terminator up front so that a final line lacking a
    # trailing newline is parsed like every other line.
    chomp $line;

    # Only use the captures when the match succeeded: after a failed match
    # $1..$3 are undef or still hold the previous line's values, which
    # would silently file data under the wrong document.
    unless ($line =~ /File-([^-]+)-([^:]+): ([^\n]*)/s) {
        warn "skipping unparsable line $. of $filename\n" if length $line;
        next;
    }

    my $fileid = $1;    # numeric or "dist"
    my $field  = $2;    # string, non-empty
    my $value  = $3;    # string, may be empty

    #print "Fileid: ". $fileid . "\n";
    #print "field: ". $field . "\n";
    #print "Value: ". $value . "\n";

    # Autovivification creates the per-document hash on first use; a later
    # line with the same file id and field overwrites the earlier value.
    $documents{$fileid}{$field} = $value;
}
close($fh);
# Fields for indexing.
# our %fields = (
# distfile => 'text',
# filename => 'text',
# isdist => 'UnAnalyzedField',
# size => 'UnAnalyzedField',
# mtime => 'UnAnalyzedField',
# md5 => 'UnAnalyzedField',
# sha1 => 'UnAnalyzedField',
# );
# The analyzer should simply tokenize filenames into their parts.
# I would split on [/._-] at least (hyphen placed last so it is a literal,
# not a range). Technically, using (\W|_|\d) as the class of split
# characters might be reasonable.
|