1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
#!/usr/bin/perl
use strict;
use warnings;
use Lucene;
use File::Basename;
# Lucene stuff by Robin H. Johnson <robbat2@gentoo.org>
# Parse the dump file into %rawdocs, a two-level hash:
#   file id (numeric, or "dist") => { field name => value }
# Each useful input line has the form "File-<fileid>-<field>: <value>".
my $filename = "sample.out";
open(my $fh, '<', $filename) or die "could not open $filename: $!";
my %rawdocs;
while (my $line = <$fh>) {
    # Only use the captures after a successful match: after a failed match
    # $1..$3 do not describe the current line, so storing them would create
    # bogus records. The trailing newline is not required, so an
    # unterminated last line is parsed as well.
    unless ($line =~ /File-([^-]+)-([^:]+): ([^\n]*)/) {
        # Blank lines are skipped silently; anything else is reported.
        warn "$filename:$.: skipping unparseable line\n" if $line =~ /\S/;
        next;
    }
    my $fileid = $1; # numeric or "dist"
    my $field  = $2; # string, non-empty
    my $value  = $3; # string, may be empty
    #print "Fileid: ". $fileid . "\n";
    #print "field: ". $field . "\n";
    #print "Value: ". $value . "\n";
    # Autovivification creates the per-file hash on first use.
    $rawdocs{$fileid}{$field} = $value;
}
close($fh);
# Fields for indexing.
# our %fields = (
# distfile => 'text',
# filename => 'text',
# isdist => 'UnAnalyzedField',
# size => 'UnAnalyzedField',
# mtime => 'UnAnalyzedField',
# md5 => 'UnAnalyzedField',
# sha1 => 'UnAnalyzedField',
# );
# The analyzer should simply tokenize filenames into their parts.
# I would split on [/.-_] at least. Technically, using
# (\W|_|\d) as the class of split characters might be reasonable.
# Set up the Lucene index writer used by the document-adding code below.
my $analyzer = Lucene::Analysis::Standard::StandardAnalyzer->new();
# The index is stored in ./data. Create the directory if it is missing and
# fail with the OS error instead of silently ignoring a failed mkdir.
unless (-d "data") {
    mkdir("data") or die "could not create index directory data: $!";
}
# NOTE(review): the trailing 0 / 1 arguments look like the binding's
# "create" flags (open existing directory / start a new index) -- confirm
# against the Lucene module documentation.
my $store = Lucene::Store::FSDirectory->getDirectory("data", 0);
my $writer = Lucene::Index::IndexWriter->new($store, $analyzer, 1);
# Writer tuning knobs; values are kept exactly as in the original script.
$writer->setMergeFactor(100);
$writer->setUseCompoundFile(0);
$writer->setMaxFieldLength(2048);
$writer->setMinMergeDocs(10);
$writer->setMaxMergeDocs(100);
# Add Documents here
# createdoc($distfile, $rawdoc)
#
# Build one Lucene::Document from a parsed record.
#   $distfile - value stored in the "distfile" field of every document
#   $rawdoc   - hash ref of field name => value for a single record
#
# A record with a true "isdistfile" value gets the package fields
# (origin cat pn cpv as Text; pv pr pf as Keyword). Any other record gets
# "path", "filename" and "directory" derived from its "name". The fields
# md5 sha1 mtime size are added as Keyword for both kinds. Every optional
# field is only added when it is defined in $rawdoc.
#
# Returns the populated Lucene::Document.
sub createdoc {
    my ($distfile, $rawdoc) = @_;
    # undef, "" and "0" all count as "not a distfile".
    my $isdist = $rawdoc->{isdistfile} ? 1 : 0;

    my $doc = Lucene::Document->new;
    $doc->add(Lucene::Document::Field->Text("distfile", $distfile));
    $doc->add(Lucene::Document::Field->Keyword("isdistfile", $isdist));

    if ($isdist) {
        for my $f (qw(origin cat pn cpv)) {
            $doc->add(Lucene::Document::Field->Text($f, $rawdoc->{$f}))
                if defined($rawdoc->{$f});
        }
        for my $f (qw(pv pr pf)) {
            $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f}))
                if defined($rawdoc->{$f});
        }
    }
    else {
        my $name = $rawdoc->{name};
        # basename()/dirname() croak on an undefined path, so a record
        # without a name is indexed without the path fields instead of
        # aborting the whole run (same policy as the other optional fields).
        if (defined $name) {
            $doc->add(Lucene::Document::Field->Text("path", $name));
            $doc->add(Lucene::Document::Field->Text("filename", basename($name)));
            $doc->add(Lucene::Document::Field->Text("directory", dirname($name)));
        }
    }

    for my $f (qw(md5 sha1 mtime size)) {
        $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f}))
            if defined($rawdoc->{$f});
    }
    return $doc;
}
# The "dist" record describes the distfile itself; its name is stored on
# every document. The exists() check avoids autovivifying an empty "dist"
# record, which would otherwise be indexed as a bogus document below.
my $distfile = exists $rawdocs{dist} ? $rawdocs{dist}{name} : undef;
defined $distfile
    or die "no File-dist-name entry found in $filename";

# Add one document per parsed record. Keys are sorted so that the printed
# progress and the order of documents in the index are reproducible.
foreach my $f (sort keys %rawdocs) {
    printf "%s\n", $f;
    my $doc = createdoc($distfile, $rawdocs{$f});
    $writer->addDocument($doc);
}
# End of Document adding
$writer->optimize();
$writer->close;
undef $writer;
|