List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
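Before the examples collected from real projects below, here is a minimal sketch of the call itself: open a reader over an index directory, look up a hit's stored fields by document ID, and read a stored field value. The index path and the "title" field name are illustrative assumptions, not taken from any of the projects listed here.

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentLookupSketch {
    public static void main(String[] args) throws Exception {
        // Assumed index location; replace with the path to your own index.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            int docID = 0;                         // any value in [0, reader.maxDoc())
            Document doc = reader.document(docID); // loads the stored fields of that document
            System.out.println(doc.get("title"));  // null if the field was not stored
        }
    }
}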
From source file:org.kimios.kernel.index.SearchQuery.java
License:Open Source License
public Vector<Document> execute(IndexReader reader) throws IndexException {
    try {
        if (this.searchRoot != null) {
            this.clauses.add(new DocumentParentClause(this.searchRoot.getPath()));
        }
        Query[] q = new Query[this.clauses.size()];
        for (int i = 0; i < this.clauses.size(); i++) {
            q[i] = this.clauses.get(i).getLuceneQuery();
        }
        Query query = IndexHelper.mergeQueries(IndexHelper.getAnalyzer(), q);
        IndexSearcher searcher = new IndexSearcher(reader);
        final Vector<Document> results = new Vector<Document>();
        final IndexReader readerCpy = reader;
        TopScoreDocCollector dcColl = TopScoreDocCollector.create(Integer.MAX_VALUE, true);
        searcher.search(query, dcColl);
        for (ScoreDoc it : dcColl.topDocs().scoreDocs) {
            try {
                long uid = Long.parseLong(readerCpy.document(it.doc).get("DocumentUid"));
                Document r = FactoryInstantiator.getInstance().getDocumentFactory().getDocument(uid);
                if (r != null) {
                    results.add(r);
                }
            } catch (Exception ex) {
            }
        }
        return results;
    } catch (Exception ex) {
        throw new IndexException(ex, "Error during query parsing : " + ex.getMessage());
    }
}
From source file:org.languagetool.dev.bigdata.GermanUppercasePhraseFinder.java
License:Open Source License
private static long getOccurrenceCount(IndexReader reader, IndexSearcher searcher, String term)
        throws IOException {
    TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
    if (topDocs.totalHits == 0) {
        return 0;
    }
    int docId = topDocs.scoreDocs[0].doc;
    Document document = reader.document(docId);
    return Long.parseLong(document.get("count"));
}
From source file:org.languagetool.dev.bigdata.LargestNGramFinder.java
License:Open Source License
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("Usage: " + LargestNGramFinder.class.getSimpleName() + " <ngramIndexDir>");
        System.exit(1);
    }
    FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath());
    IndexReader reader = DirectoryReader.open(fsDir);
    IndexSearcher searcher = new IndexSearcher(reader);
    Fields fields = MultiFields.getFields(reader);
    long max = 0;
    String maxTerm = "";
    Terms terms = fields.terms("ngram");
    TermsEnum termsEnum = terms.iterator();
    int count = 0;
    BytesRef next;
    while ((next = termsEnum.next()) != null) {
        String term = next.utf8ToString();
        TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5);
        int docId = topDocs.scoreDocs[0].doc;
        Document document = reader.document(docId);
        long thisCount = Long.parseLong(document.get("count"));
        if (max < thisCount) {
            max = thisCount;
            maxTerm = term;
        }
        if (count % 10_000 == 0) {
            System.out.println(count + " -> " + topDocs.totalHits + " for " + term + " -> " + thisCount
                    + ", max so far: " + max + " for '" + maxTerm + "'");
        }
        count++;
    }
    System.out.println("Max: " + max + " for " + maxTerm);
}
From source file:org.meresco.lucene.MerescoClustererTest.java
License:Open Source License
@Test
public void testClusteringOnVectorsMultipleStrategies() throws IOException, Exception {
    ClusterConfig clusterConfig = new ClusterConfig(42);
    clusterConfig.addStrategy(new ClusterStrategy(0.5, 2).addField("termvector.field", 1.0, "vuur"));
    clusterConfig.addStrategy(new ClusterStrategy(0.4, 1).addField("termvector.field", 1.0, null));
    clusterConfig.addStrategy(new ClusterStrategy(0.4, 2).addField("termvector.field", 1.0, "anders"));
    InterpolateEpsilon interpolateEpsilon = new InterpolateEpsilon() {
        @Override
        public double interpolateEpsilon(int hits, int sliceSize, double clusteringEps, int clusterMoreRecords) {
            assertEquals(100, hits);
            assertEquals(10, sliceSize);
            assertTrue(clusteringEps >= 0.4);
            assertEquals(42, clusterMoreRecords);
            return clusteringEps;
        }
    };
    IndexReader indexReader = getIndexReader();
    MerescoClusterer merescoClusterer = new MerescoClusterer(indexReader, clusterConfig, interpolateEpsilon, 100, 10);
    for (int i = 0; i < 15; i++) {
        merescoClusterer.collect(i);
    }
    merescoClusterer.finish();
    assertEquals(3, merescoClusterer.clusters.size());
    for (int i = 0; i < 15; i++) {
        String theID = indexReader.document(i).get(Lucene.ID_FIELD);
        MerescoCluster cluster = merescoClusterer.cluster(i);
        Set<String> ids = new HashSet<>();
        for (DocScore ds : cluster.topDocs) {
            ids.add(indexReader.document(ds.docId).get(Lucene.ID_FIELD));
        }
        assertTrue(ids.contains(theID));
        int idOrd = Integer.valueOf(theID.split(":")[1]);
        if (0 <= idOrd && idOrd <= 4) {
            assertEquals(new HashSet<String>(Arrays.asList("id:4", "id:0", "id:1", "id:2", "id:3")), ids);
        } else if (5 <= idOrd && idOrd <= 9) {
            assertEquals(new HashSet<String>(Arrays.asList("id:8", "id:7", "id:6", "id:5", "id:9")), ids);
        } else {
            assertEquals(new HashSet<String>(Arrays.asList("id:10", "id:11", "id:12", "id:13", "id:14")), ids);
        }
    }
}
From source file:org.metaservice.core.maven.MavenIndexCrawler.java
License:Apache License
public void perform() throws IOException, ComponentLookupException, InvalidVersionSpecificationException {
    // Files where local cache is (if any) and Lucene Index should be located
    File centralLocalCache = new File("target/central-cache");
    File centralIndexDir = new File("target/central-index");

    // Creators we want to use (search for fields it defines)
    List<IndexCreator> indexers = new ArrayList<>();
    indexers.add(plexusContainer.lookup(IndexCreator.class, "min"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "jarContent"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "maven-plugin"));

    // Create context for central repository index
    centralContext = indexer.createIndexingContext("central-context", "central", centralLocalCache,
            centralIndexDir, "http://repo1.maven.org/maven2", null, true, true, indexers);

    // Update the index (incremental update will happen if this is not 1st run and files are not deleted)
    // This whole block below should not be executed on every app start, but rather controlled by some configuration
    // since this block will always emit at least one HTTP GET. Central indexes are updated once a week, but
    // other index sources might have different index publishing frequency.
    // Preferred frequency is once a week.
    if (true) {
        System.out.println("Updating Index...");
        System.out.println("This might take a while on first run, so please be patient!");
        // Create ResourceFetcher implementation to be used with IndexUpdateRequest
        // Here, we use Wagon based one as shorthand, but all we need is a ResourceFetcher implementation
        TransferListener listener = new AbstractTransferListener() {
            public void transferStarted(TransferEvent transferEvent) {
                System.out.print(" Downloading " + transferEvent.getResource().getName());
            }

            public void transferProgress(TransferEvent transferEvent, byte[] buffer, int length) {
            }

            public void transferCompleted(TransferEvent transferEvent) {
                System.out.println(" - Done");
            }
        };
        ResourceFetcher resourceFetcher = new WagonHelper.WagonFetcher(httpWagon, listener, null, null);

        Date centralContextCurrentTimestamp = centralContext.getTimestamp();
        IndexUpdateRequest updateRequest = new IndexUpdateRequest(centralContext, resourceFetcher);
        IndexUpdateResult updateResult = indexUpdater.fetchAndUpdateIndex(updateRequest);
        if (updateResult.isFullUpdate()) {
            System.out.println("Full update happened!");
        } else if (updateResult.getTimestamp().equals(centralContextCurrentTimestamp)) {
            System.out.println("No update needed, index is up to date!");
        } else {
            System.out.println("Incremental update happened, change covered " + centralContextCurrentTimestamp
                    + " - " + updateResult.getTimestamp() + " period.");
        }
        System.out.println();
    }

    System.out.println();
    System.out.println("Using index");
    System.out.println("===========");
    System.out.println();

    // ====
    // Case:
    // dump all the GAVs
    // NOTE: will not actually execute do this below, is too long to do (Central is HUGE), but is here as code
    // example
    int j = 0;
    if (true) {
        final IndexSearcher searcher = centralContext.acquireIndexSearcher();
        try {
            final IndexReader ir = searcher.getIndexReader();
            for (int i = 0; i < ir.maxDoc(); i++) {
                if (!ir.isDeleted(i)) {
                    j++;
                    final Document doc = ir.document(i);
                    final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, centralContext);
                    if (ai != null && "pom".equals(ai.fextension))
                        System.out.println(ai.groupId + ":" + ai.artifactId + ":" + ai.version + ":"
                                + ai.classifier + " (sha1=" + ai.sha1 + ")");
                }
            }
        } finally {
            centralContext.releaseIndexSearcher(searcher);
        }
    }
    System.err.println(j);
    if (j > 0)
        return;

    // ====
    // Case:
    // Search for all GAVs with known G and A and having version greater than V
    final GenericVersionScheme versionScheme = new GenericVersionScheme();
    final String versionString = "1.5.0";
    final Version version = versionScheme.parseVersion(versionString);

    // construct the query for known GA
    final Query groupIdQ = indexer.constructQuery(MAVEN.GROUP_ID,
            new SourcedSearchExpression("org.sonatype.nexus"));
    final Query artifactIdQ = indexer.constructQuery(MAVEN.ARTIFACT_ID,
            new SourcedSearchExpression("nexus-api"));
    final BooleanQuery query = new BooleanQuery();
    query.add(groupIdQ, BooleanClause.Occur.MUST);
    query.add(artifactIdQ, BooleanClause.Occur.MUST);

    // we want "jar" artifacts only
    query.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("jar")),
            BooleanClause.Occur.MUST);

    // we want main artifacts only (no classifier)
    // Note: this below is unfinished API, needs fixing
    query.add(indexer.constructQuery(MAVEN.CLASSIFIER, new SourcedSearchExpression(Field.NOT_PRESENT)),
            BooleanClause.Occur.MUST_NOT);

    // construct the filter to express "V greater than"
    final ArtifactInfoFilter versionFilter = new ArtifactInfoFilter() {
        public boolean accepts(final IndexingContext ctx, final ArtifactInfo ai) {
            try {
                final Version aiV = versionScheme.parseVersion(ai.version);
                // Use ">=" if you are INCLUSIVE
                return aiV.compareTo(version) > 0;
            } catch (InvalidVersionSpecificationException e) {
                // do something here? be safe and include?
                return true;
            }
        }
    };

    System.out.println(
            "Searching for all GAVs with G=org.sonatype.nexus and nexus-api and having V greater than 1.5.0");
    final IteratorSearchRequest request = new IteratorSearchRequest(query,
            Collections.singletonList(centralContext), versionFilter);
    final IteratorSearchResponse response = indexer.searchIterator(request);
    for (ArtifactInfo ai : response) {
        System.out.println(ai.toString());
    }

    // Case:
    // Use index
    // Searching for some artifact
    Query gidQ = indexer.constructQuery(MAVEN.GROUP_ID,
            new SourcedSearchExpression("org.apache.maven.indexer"));
    Query aidQ = indexer.constructQuery(MAVEN.ARTIFACT_ID,
            new SourcedSearchExpression("indexer-artifact"));

    BooleanQuery bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);
    searchAndDump(indexer, "all artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);

    // Searching for some main artifact
    bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);
    // bq.add( nexusIndexer.constructQuery( MAVEN.CLASSIFIER, new SourcedSearchExpression( "*" ) ), Occur.MUST_NOT
    // );
    searchAndDump(indexer, "main artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);

    // doing sha1 search
    searchAndDump(indexer, "SHA1 7ab67e6b20e5332a7fb4fdf2f019aec4275846c2", indexer.constructQuery(MAVEN.SHA1,
            new SourcedSearchExpression("7ab67e6b20e5332a7fb4fdf2f019aec4275846c2")));
    searchAndDump(indexer, "SHA1 7ab67e6b20 (partial hash)",
            indexer.constructQuery(MAVEN.SHA1, new UserInputSearchExpression("7ab67e6b20")));

    // doing classname search (incomplete classname)
    searchAndDump(indexer, "classname DefaultNexusIndexer (note: Central does not publish classes in the index)",
            indexer.constructQuery(MAVEN.CLASSNAMES, new UserInputSearchExpression("DefaultNexusIndexer")));

    // doing search for all "canonical" maven plugins latest versions
    bq = new BooleanQuery();
    bq.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("maven-plugin")),
            BooleanClause.Occur.MUST);
    bq.add(indexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.apache.maven.plugins")),
            BooleanClause.Occur.MUST);
    searchGroupedAndDump(indexer, "all \"canonical\" maven plugins", bq, new GAGrouping());

    // close cleanly
    indexer.closeIndexingContext(centralContext, false);
}
From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java
License:Open Source License
/**
 * Basic test of all fields
 *
 * @throws Exception
 */
@Test
public void testIndexReleaseGroupFields() throws Exception {
    addReleaseGroupTwo();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);
    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        Document doc = ir.document(1);
        assertEquals(1, doc.getFields(ReleaseGroupIndexField.RELEASEGROUP.getName()).length);
        assertEquals("Crocodiles", doc.getField(ReleaseGroupIndexField.RELEASEGROUP.getName()).stringValue());
        assertEquals("efd2ace2-b3b9-305f-8a53-9803595c0e37",
                doc.getField(ReleaseGroupIndexField.RELEASEGROUP_ID.getName()).stringValue());
        assertEquals(1, doc.getFields(ReleaseGroupIndexField.RELEASE.getName()).length);
        assertEquals("Crocodiles (bonus disc)",
                doc.getField(ReleaseGroupIndexField.RELEASE.getName()).stringValue());
        checkTerm(ir, ReleaseGroupIndexField.ARTIST_ID, "ccd4879c-5e88-4385-b131-bf65296bf245");
    }
    ir.close();
}
From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java
License:Open Source License
/**
 * Basic test of all fields
 *
 * @throws Exception
 */
@Test
public void testIndexReleaseGroupAlias() throws Exception {
    addReleaseGroupTwo();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);
    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        Document doc = ir.document(1);
        assertEquals(1, doc.getFields(ReleaseGroupIndexField.RELEASEGROUP.getName()).length);
        checkTerm(ir, ReleaseGroupIndexField.ARTIST_NAME, "aliastest");
        checkTermX(ir, ReleaseGroupIndexField.ARTIST_NAME, "and", 1);
        checkTermX(ir, ReleaseGroupIndexField.ARTIST_NAME, "bunnymen", 2);
        checkTermX(ir, ReleaseGroupIndexField.ARTIST_NAME, "echo", 3);
    }
    ir.close();
}
From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java
License:Open Source License
@Test
public void testIndexReleaseGroupWithType() throws Exception {
    addReleaseGroupOne();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);
    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        Document doc = ir.document(1);
        assertEquals("Album", doc.getField(ReleaseGroupIndexField.PRIMARY_TYPE.getName()).stringValue());
        assertEquals(1, doc.getFields(ReleaseGroupIndexField.PRIMARY_TYPE.getName()).length);
        assertEquals("Album", doc.getField(ReleaseGroupIndexField.TYPE.getName()).stringValue());
        assertEquals(1, doc.getFields(ReleaseGroupIndexField.TYPE.getName()).length);
        assertEquals("Album", doc.getField(ReleaseGroupIndexField.PRIMARY_TYPE.getName()).stringValue());
    }
    ir.close();
}
From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java
License:Open Source License
@Test
public void testIndexReleaseGroupWithComment() throws Exception {
    addReleaseGroupOne();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);
    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        Document doc = ir.document(1);
        assertEquals(1, doc.getFields(ReleaseGroupIndexField.COMMENT.getName()).length);
        assertEquals("demo", doc.getField(ReleaseGroupIndexField.COMMENT.getName()).stringValue());
    }
    ir.close();
}
From source file:org.musicbrainz.search.index.ReleaseGroupIndexTest.java
License:Open Source License
@Test
public void testIndexReleaseGroupSortname() throws Exception {
    addReleaseGroupOne();
    RAMDirectory ramDir = new RAMDirectory();
    createIndex(ramDir);
    IndexReader ir = DirectoryReader.open(ramDir);
    assertEquals(2, ir.numDocs());
    {
        Document doc = ir.document(1);
        ArtistCredit ac = ArtistCreditHelper
                .unserialize(doc.get(ReleaseGroupIndexField.ARTIST_CREDIT.getName()));
        assertNotNull(ac);
        assertEquals("Echo and The Bunnymen", ac.getNameCredit().get(0).getArtist().getSortName());
    }
    ir.close();
}