diff options
author | Robin H. Johnson <robbat2@gentoo.org> | 2008-03-20 17:40:37 -0700 |
---|---|---|
committer | Robin H. Johnson <robbat2@gentoo.org> | 2008-03-21 20:13:06 -0700 |
commit | 9a14f7fe3eb10585ead82f99a43802582ee4d5ba (patch) | |
tree | 76f49810ccd16b407ceaec1b8449e3c193c3466d /read-index.pl | |
parent | Alter the fields slightly. (diff) | |
download | distindex-9a14f7fe3eb10585ead82f99a43802582ee4d5ba.tar.gz distindex-9a14f7fe3eb10585ead82f99a43802582ee4d5ba.tar.bz2 distindex-9a14f7fe3eb10585ead82f99a43802582ee4d5ba.zip |
Add index creation script.
Diffstat (limited to 'read-index.pl')
-rw-r--r-- | read-index.pl | 64 |
1 files changed, 59 insertions, 5 deletions
```diff
diff --git a/read-index.pl b/read-index.pl
index c279750..3041a1c 100644
--- a/read-index.pl
+++ b/read-index.pl
@@ -2,11 +2,17 @@
 use strict;
 use warnings;
 
+use Lucene;
+use File::Basename;
+
+# Lucene stuff by Robin H. Johnson <robbat2@gentoo.org>
+
+
 my $filename = "sample.out";
 
 open(my $fh, $filename) or die "could not open $filename";
 
-my %documents;
+my %rawdocs;
 while (my $line=<$fh>) {
     $line =~ /File-([^-]+)-([^:]+): ([^\n]*)\n/s;
     my $fileid = $1; # numeric or "dist"
@@ -15,11 +21,11 @@ while (my $line=<$fh>) {
     #print "Fileid: ". $fileid . "\n";
     #print "field: ". $field . "\n";
     #print "Value: ". $value . "\n";
-
-    if ( ! $documents{$fileid} ) {
-        $documents{$fileid} = { $field => $value };
+
+    if ( ! $rawdocs{$fileid} ) {
+        $rawdocs{$fileid} = { $field => $value };
     } else {
-        $documents{$fileid}{$field} = $value;
+        $rawdocs{$fileid}{$field} = $value;
     }
 }
 close($fh);
@@ -41,3 +47,51 @@ close($fh);
 # i would split up by [/.-_] at least. technically, using
 # (\W|_|\d) as the class of split characters might be reasonable
 
+my $analyzer = new Lucene::Analysis::Standard::StandardAnalyzer();
+mkdir "data";
+my $store = Lucene::Store::FSDirectory->getDirectory("data", 0);
+my $writer = new Lucene::Index::IndexWriter($store, $analyzer, 1);
+$writer->setMergeFactor(100);
+$writer->setUseCompoundFile(0);
+$writer->setMaxFieldLength(2048);
+$writer->setMinMergeDocs(10);
+$writer->setMaxMergeDocs(100);
+
+# Add Documents here
+sub createdoc {
+    my ($distfile, $rawdoc) = @_;
+    my $isdist = defined($rawdoc->{isdistfile}) && $rawdoc->{isdistfile} ? 1 : 0;
+    my $doc = new Lucene::Document;
+    $doc->add(Lucene::Document::Field->Text("distfile", $distfile));
+    $doc->add(Lucene::Document::Field->Keyword("isdistfile", $isdist));
+    if($isdist) {
+        for my $f (qw(origin cat pn cpv)) {
+            $doc->add(Lucene::Document::Field->Text($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+        }
+        for my $f (qw(pv pr pf)) {
+            $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+        }
+    } else {
+        my $name = $rawdoc->{name};
+        $doc->add(Lucene::Document::Field->Text("path", $name));
+        $doc->add(Lucene::Document::Field->Text("filename", basename($name)));
+        $doc->add(Lucene::Document::Field->Text("directory", dirname($name)));
+    }
+    for my $f (qw(md5 sha1 mtime size)) {
+        $doc->add(Lucene::Document::Field->Keyword($f, $rawdoc->{$f})) if defined($rawdoc->{$f});
+    }
+    return $doc;
+}
+
+my $distfile = $rawdocs{dist}{name};
+foreach my $f (keys(%rawdocs)) {
+    printf "%s\n", $f;
+    my $doc = createdoc($distfile, $rawdocs{$f});
+    $writer->addDocument($doc);
+}
+
+# End of Document adding
+$writer->optimize();
+$writer->close;
+undef $writer;
+
```