List of usage examples for org.apache.lucene.search.IndexSearcher.search
public <C extends Collector, T> T search(Query query, CollectorManager<C, T> collectorManager) throws IOException
public TopDocs search(Query query, int n) throws IOException
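The CollectorManager overload is not exercised by the examples below; they all use the TopDocs-returning search(Query, int) form. For completeness, a minimal sketch of the CollectorManager variant, assuming a recent Lucene release that provides TotalHitCountCollectorManager (the examples on this page otherwise target 4.x/5.x-era APIs):

    // TotalHitCountCollectorManager creates one collector per leaf slice;
    // reduce() merges the per-slice counts into a single hit count
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
    int totalHits = searcher.search(new MatchAllDocsQuery(),
            new TotalHitCountCollectorManager());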
From source file: com.mathworks.xzheng.tools.BooksMoreLikeThis.java
License: Apache License

public static void main(String[] args) throws Throwable {
    String indexDir = System.getProperty("index.dir");
    FSDirectory directory = FSDirectory.open(new File(indexDir));
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    int numDocs = reader.maxDoc();

    MoreLikeThis mlt = new MoreLikeThis(reader);              // #A
    mlt.setFieldNames(new String[] { "title", "author" });
    mlt.setMinTermFreq(1);                                    // #B
    mlt.setMinDocFreq(1);

    for (int docID = 0; docID < numDocs; docID++) {           // #C
        System.out.println();
        Document doc = reader.document(docID);
        System.out.println(doc.get("title"));

        Query query = mlt.like(docID);                        // #D
        System.out.println(" query=" + query);

        TopDocs similarDocs = searcher.search(query, 10);
        if (similarDocs.totalHits == 0) {
            System.out.println(" None like this");
        }
        for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
            if (similarDocs.scoreDocs[i].doc != docID) {      // #E
                doc = reader.document(similarDocs.scoreDocs[i].doc);
                System.out.println(" -> " + doc.getField("title").stringValue());
            }
        }
    }
    reader.close();
    directory.close();
}
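MoreLikeThis can also build a query from text that is not in the index. A minimal sketch, assuming the 4.x-era like(Reader, String) signature (the query text and field name are illustrative placeholders, not part of this source):

    // requires java.io.StringReader; extracts interesting terms from the
    // given text against the "title" field's statistics
    Query adHoc = mlt.like(new StringReader("swing gui programming"), "title");
    TopDocs similar = searcher.search(adHoc, 10);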
From source file: com.mathworks.xzheng.tools.FastVectorHighlighterSample.java
License: Apache License

static void searchIndex(String filename) throws Exception {
    QueryParser parser = new QueryParser(Version.LUCENE_46, F, analyzer);
    Query query = parser.parse(QUERY);
    FastVectorHighlighter highlighter = getHighlighter();      // #C
    FieldQuery fieldQuery = highlighter.getFieldQuery(query);  // #D
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(dir));
    TopDocs docs = searcher.search(query, 10);

    FileWriter writer = new FileWriter(filename);
    writer.write("<html>");
    writer.write("<body>");
    writer.write("<p>QUERY : " + QUERY + "</p>");
    for (ScoreDoc scoreDoc : docs.scoreDocs) {
        String snippet = highlighter.getBestFragment(          // #E
                fieldQuery, searcher.getIndexReader(),         // #E
                scoreDoc.doc, F, 100);                         // #E
        if (snippet != null) {
            writer.write(scoreDoc.doc + " : " + snippet + "<br/>");
        }
    }
    writer.write("</body></html>");
    writer.close();
}
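FastVectorHighlighter only works on fields that were indexed with term vectors including positions and offsets. A minimal indexing-side sketch (F is the sample's field constant; text is a placeholder value):

    FieldType ft = new FieldType(TextField.TYPE_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);  // offsets are required by FVH
    doc.add(new Field(F, text, ft));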
From source file: com.mathworks.xzheng.tools.HighlightTest.java
License: Apache License

public void testHits() throws Exception {
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(TestUtil.getBookIndexDirectory()));
    TermQuery query = new TermQuery(new Term("title", "action"));
    TopDocs hits = searcher.search(query, 10);

    QueryScorer scorer = new QueryScorer(query, "title");
    Highlighter highlighter = new Highlighter(scorer);
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));

    Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_46);
    for (ScoreDoc sd : hits.scoreDocs) {
        Document doc = searcher.doc(sd.doc);
        String title = doc.get("title");
        TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
                sd.doc, "title", doc, analyzer);
        String fragment = highlighter.getBestFragment(stream, title);
        System.out.println(fragment);
    }
}
From source file: com.mathworks.xzheng.tools.RegexQueryTest.java
License: Apache License

public void testRegexQuery() throws Exception {
    Directory directory = TestUtil.getBookIndexDirectory();
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(directory));
    RegexQuery q = new RegexQuery(new Term("title", ".*st.*"));
    TopDocs hits = searcher.search(q, 10);
    assertEquals(2, hits.totalHits);
    assertTrue(TestUtil.hitsIncludeTitle(searcher, hits, "Tapestry in Action"));
    assertTrue(TestUtil.hitsIncludeTitle(searcher, hits,
            "Mindstorms: Children, Computers, And Powerful Ideas"));
    directory.close();
}
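RegexQuery comes from Lucene's sandbox module and evaluates the pattern against terms one by one. Core Lucene also ships RegexpQuery, which compiles the pattern into an automaton and intersects it with the terms dictionary, which is usually much faster. A sketch of the same search using it (note RegexpQuery uses Lucene's own regexp flavor, anchored over the whole term, though ".*st.*" means the same thing in both):

    RegexpQuery q = new RegexpQuery(new Term("title", ".*st.*"));
    TopDocs hits = searcher.search(q, 10);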
From source file: com.mathworks.xzheng.tools.remote.SearchClient.java
License: Apache License

private static void search(String name, String word) throws Exception {
    TermQuery query = new TermQuery(new Term("word", word));

    IndexSearcher searcher = (IndexSearcher) searcherCache.get(name);       //2
    if (searcher == null) {
        searcher = new IndexSearcher(new MultiReader(lookupRemote(name)));  //3
        searcherCache.put(name, searcher);
    }

    long begin = new Date().getTime();                                      //4
    TopDocs hits = searcher.search(query, 10);                              //4
    long end = new Date().getTime();                                        //4
    System.out.print("Searched " + name + " for '" + word + "' (" + (end - begin) + " ms): ");

    if (hits.scoreDocs.length == 0) {
        System.out.print("<NONE FOUND>");
    }
    for (ScoreDoc sd : hits.scoreDocs) {
        Document doc = searcher.doc(sd.doc);
        String[] values = doc.getValues("syn");
        for (String syn : values) {
            System.out.print(syn + " ");
        }
    }
    System.out.println();
    System.out.println();                                                   // 5
}
From source file: com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License: Apache License

/**
 * {@inheritDoc}
 */
@Override
public void train(LeafReader leafReader, String textFieldName, String classFieldName,
        Analyzer analyzer, Query query) throws IOException {
    this.textTerms = MultiFields.getTerms(leafReader, textFieldName);
    if (textTerms == null) {
        throw new IOException("term vectors need to be available for field " + textFieldName);
    }

    this.analyzer = analyzer;
    this.textFieldName = textFieldName;

    if (threshold == null || threshold == 0d) {
        // automatically assign a threshold
        long sumDocFreq = leafReader.getSumDocFreq(textFieldName);
        if (sumDocFreq != -1) {
            this.threshold = (double) sumDocFreq / 2d;
        } else {
            throw new IOException("threshold cannot be assigned since term vectors for field "
                    + textFieldName + " do not exist");
        }
    }

    // TODO : remove this map as soon as we have a writable FST
    SortedMap<String, Double> weights = new TreeMap<>();

    TermsEnum termsEnum = textTerms.iterator();
    BytesRef textTerm;
    while ((textTerm = termsEnum.next()) != null) {
        weights.put(textTerm.utf8ToString(), (double) termsEnum.totalTermFreq());
    }
    updateFST(weights);

    IndexSearcher indexSearcher = new IndexSearcher(leafReader);

    int batchCount = 0;

    BooleanQuery q = new BooleanQuery();
    q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, "*")), BooleanClause.Occur.MUST));
    if (query != null) {
        q.add(new BooleanClause(query, BooleanClause.Occur.MUST));
    }
    // run the search and use stored field values
    for (ScoreDoc scoreDoc : indexSearcher.search(q, Integer.MAX_VALUE).scoreDocs) {
        Document doc = indexSearcher.doc(scoreDoc.doc);

        IndexableField textField = doc.getField(textFieldName);
        // get the expected result
        IndexableField classField = doc.getField(classFieldName);

        if (textField != null && classField != null) {
            // assign class to the doc
            ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
            Boolean assignedClass = classificationResult.getAssignedClass();

            Boolean correctClass = Boolean.valueOf(classField.stringValue());
            long modifier = correctClass.compareTo(assignedClass);
            if (modifier != 0) {
                updateWeights(leafReader, scoreDoc.doc, assignedClass, weights, modifier,
                        batchCount % batchSize == 0);
            }
            batchCount++;
        }
    }
    weights.clear(); // free memory while waiting for GC
}
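A hypothetical calling sketch for this trainer, assuming an index whose documents carry a text field and a boolean class field; the field names, analyzer, and SlowCompositeReaderWrapper usage are illustrative assumptions, not part of this source:

    LeafReader leafReader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(dir));
    BooleanPerceptronClassifier classifier = new BooleanPerceptronClassifier();
    // train on every document that has both fields; a null query adds no extra filter
    classifier.train(leafReader, "text", "spam", new StandardAnalyzer(), null);
    ClassificationResult<Boolean> result = classifier.assignClass("cheap pills, click here");
    System.out.println(result.getAssignedClass() + " score=" + result.getScore());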
From source file: com.meizu.nlp.classification.utils.DatasetSplitter.java
License: Apache License

/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex        an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code> if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
        Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {

    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));

    try {
        int size = originalIndex.maxDoc();

        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);

        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPositions(true);

        int b = 0;

        // iterate over existing documents
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.length > 0) {
                for (String fieldName : fieldNames) {
                    doc.add(new Field(fieldName,
                            originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
                }
            } else {
                for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
                    if (storableField.readerValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
                    } else if (storableField.binaryValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
                    } else if (storableField.stringValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
                    } else if (storableField.numericValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
                    }
                }
            }

            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
                testWriter.addDocument(doc);
            } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
                cvWriter.addDocument(doc);
            } else {
                trainingWriter.addDocument(doc);
            }
            b++;
        }
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        testWriter.commit();
        cvWriter.commit();
        trainingWriter.commit();
        // close IWs
        testWriter.close();
        cvWriter.close();
        trainingWriter.close();
    }
}
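A hypothetical calling sketch, assuming the class exposes a constructor taking the test and cross-validation ratios (the reader, directories, and field names below are placeholders):

    // 10% test, 10% cross validation, the remaining 80% training
    DatasetSplitter splitter = new DatasetSplitter(0.1d, 0.1d);
    splitter.split(originalIndex, trainingIndex, testIndex, crossValidationIndex,
            new StandardAnalyzer(), "title", "body");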
From source file: com.meizu.nlp.classification.utils.DocToDoubleVectorUtilsTest.java
License: Apache License

@Test
public void testDenseFreqDoubleArrayConversion() throws Exception {
    IndexSearcher indexSearcher = new IndexSearcher(index);
    for (ScoreDoc scoreDoc : indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs) {
        Terms docTerms = index.getTermVector(scoreDoc.doc, "text");
        Double[] vector = DocToDoubleVectorUtils.toDenseLocalFreqDoubleArray(docTerms);
        assertNotNull(vector);
        assertTrue(vector.length > 0);
    }
}
From source file: com.meizu.nlp.classification.utils.DocToDoubleVectorUtilsTest.java
License: Apache License

@Test
public void testSparseFreqDoubleArrayConversion() throws Exception {
    Terms fieldTerms = MultiFields.getTerms(index, "text");
    if (fieldTerms != null && fieldTerms.size() != -1) {
        IndexSearcher indexSearcher = new IndexSearcher(index);
        for (ScoreDoc scoreDoc : indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs) {
            Terms docTerms = index.getTermVector(scoreDoc.doc, "text");
            Double[] vector = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
            assertNotNull(vector);
            assertTrue(vector.length > 0);
        }
    }
}
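The dense or sparse frequency vectors produced above can feed straight into ordinary vector math. An illustrative helper (not part of this source) computing cosine similarity between two such vectors of equal length:

    static double cosineSimilarity(Double[] a, Double[] b) {
        double dot = 0d, normA = 0d, normB = 0d;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            normA += a[i] * a[i];
            normB += b[i] * b[i];
        }
        // 1.0 means identical direction, 0.0 means no shared terms
        return dot / (Math.sqrt(normA) * Math.sqrt(normB));
    }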
From source file: com.mikeqian.search.SearchFiles.java
License: Apache License

/**
 * Simple command-line based search demo.
 */
public static void main(String[] args) throws Exception {
    String index = System.getProperty("java.io.tmpdir", "tmp")
            + System.getProperty("file.separator") + "index-dir";
    System.out.println(index);
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    WordConfTools.set("dic.path", "classpath:dic.txt,classpath:dic_CN.txt");
    Analyzer analyzer = new ChineseWordAnalyzer();

    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(field, analyzer);
    while (true) {
        if (queries == null && queryString == null) {
            // prompt the user
            System.out.println("Enter query: ");
        }

        String line = queryString != null ? queryString : in.readLine();
        if (line == null) {
            break;
        }

        line = line.trim();
        if (line.length() == 0) {
            break;
        }

        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));

        if (repeat > 0) {
            // repeat & time as benchmark
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

        if (queryString != null) {
            break;
        }
    }
    reader.close();
}
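The doPagingSearch helper is not shown in this snippet. For a rough idea of how result paging can work, a minimal sketch (not the demo's implementation) using IndexSearcher.searchAfter, which resumes a search from the last hit of the previous page:

    TopDocs firstPage = searcher.search(query, hitsPerPage);
    if (firstPage.scoreDocs.length == hitsPerPage) {
        ScoreDoc last = firstPage.scoreDocs[firstPage.scoreDocs.length - 1];
        // collects the next hitsPerPage hits that sort after 'last'
        TopDocs nextPage = searcher.searchAfter(last, query, hitsPerPage);
    }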