List of usage examples for org.apache.lucene.search IndexSearcher doc
public Document doc(int docID) throws IOException
.getIndexReader().document(docID)
From source file:luceneexample.LuceneExample.java
private void searchInDirectory(String indexDirectory, String queryString) throws IOException, ParseException { System.out.println("Searching for '" + queryString + "'"); File pathIndexDir = new File(indexDirectory); Directory index = FSDirectory.open(pathIndexDir); IndexReader reader = IndexReader.open(index); IndexSearcher searcher = new IndexSearcher(reader); QueryParser queryParser = new QueryParser(Version.LUCENE_36, LuceneConstants.CONTENTS, new StandardAnalyzer(Version.LUCENE_36)); Query query = queryParser.parse(queryString); TopDocs tops = searcher.search(query, 30); ScoreDoc[] scoreDoc = tops.scoreDocs; System.out.println(scoreDoc.length); for (ScoreDoc score : scoreDoc) { // System.out.println("DOC " + score.doc + " SCORE " + score.score); Document doc = searcher.doc(score.doc); System.out.println("File: " + doc.get(LuceneConstants.FILE_NAME)); searcher.close();/*from w w w .ja v a 2 s. c o m*/ } }
From source file:luceneexamples.IndexAndSearch.java
License:Apache License
/** Indexes two documents into a RAM directory and verifies that a "fox" query matches exactly the first one. */
@Test
public void index() throws Exception {
    RAMDirectory directory = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_31, analyzer);
    IndexWriter writer = new IndexWriter(directory, config);

    // First document: an English sentence containing "fox".
    Document first = new Document();
    first.add(new Field("str_field", "quick brown fox jumped over the lazy dog.",
            Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(first);

    // Second document: non-matching content.
    Document second = new Document();
    second.add(new Field("str_field", "?????", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(second);
    writer.close();

    // Query the committed index; only the first document should be a hit.
    IndexSearcher searcher = new IndexSearcher(directory, true);
    QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
    TopDocs topDocs = searcher.search(parser.parse("fox"), 1000);
    assertThat(topDocs.totalHits, is(1));
    Document matched = searcher.doc(topDocs.scoreDocs[0].doc);
    assertEquals("quick brown fox jumped over the lazy dog.", matched.get("str_field"));

    searcher.close();
    directory.close();
}
From source file:luceneexamples.NumericFieldDocument.java
License:Apache License
@Test public void index() throws Exception { RAMDirectory directory = new RAMDirectory(); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); IndexWriter writer = new IndexWriter(directory, iwc); for (int i = 8; i < 12; i++) { Document doc = new Document(); doc.add(new NumericField("int_field", Field.Store.YES, true).setIntValue(i)); System.out.println(doc);// w w w. jav a 2 s . c om writer.addDocument(doc); } writer.commit(); IndexReader reader = IndexReader.open(writer, true); IndexSearcher searcher = new IndexSearcher(reader); TopDocs td = searcher.search(new MatchAllDocsQuery(), 1000, new Sort(new SortField("int_field", SortField.INT))); assertThat(td.totalHits, is(4)); assertThat(searcher.doc(td.scoreDocs[0].doc).get("int_field"), equalTo("8")); assertThat(searcher.doc(td.scoreDocs[1].doc).get("int_field"), equalTo("9")); assertThat(searcher.doc(td.scoreDocs[2].doc).get("int_field"), equalTo("10")); assertThat(searcher.doc(td.scoreDocs[3].doc).get("int_field"), equalTo("11")); reader.close(); writer.close(); searcher.close(); directory.close(); }
From source file:luceneexamples.SortDocuments.java
License:Apache License
@Test public void index() throws Exception { RAMDirectory directory = new RAMDirectory(); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); IndexWriter writer = new IndexWriter(directory, iwc); Document doc = new Document(); doc.add(new Field("str_field", "abc", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc);//from w w w .j a v a2 s . c om Document doc2 = new Document(); doc2.add(new Field("str_field", "def", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc2); Document doc3 = new Document(); doc3.add(new Field("str_field", "hij", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc3); writer.commit(); IndexReader reader = IndexReader.open(writer, true); IndexSearcher searcher = new IndexSearcher(reader); TopDocs td = searcher.search(new MatchAllDocsQuery(), 1000, new Sort(new SortField("str_field", SortField.STRING))); assertThat(td.totalHits, is(3)); assertThat(searcher.doc(td.scoreDocs[0].doc).get("str_field"), equalTo("abc")); assertThat(searcher.doc(td.scoreDocs[1].doc).get("str_field"), equalTo("def")); assertThat(searcher.doc(td.scoreDocs[2].doc).get("str_field"), equalTo("hij")); td = searcher.search(new MatchAllDocsQuery(), 1000, new Sort(new SortField("str_field", SortField.STRING, true))); assertThat(td.totalHits, is(3)); assertThat(searcher.doc(td.scoreDocs[0].doc).get("str_field"), equalTo("hij")); assertThat(searcher.doc(td.scoreDocs[1].doc).get("str_field"), equalTo("def")); assertThat(searcher.doc(td.scoreDocs[2].doc).get("str_field"), equalTo("abc")); reader.close(); writer.close(); searcher.close(); directory.close(); }
From source file:luceneGazateer.EntryData.java
License:Apache License
public ArrayList<EntryData> searchDocuments(String indexerPath, String inputRecord, DocType recordType) throws IOException { File indexfile = new File(indexerPath); indexDir = FSDirectory.open(indexfile.toPath()); //inputRecord.replace(","," "); if (!DirectoryReader.indexExists(indexDir)) { LOG.log(Level.SEVERE, "No Lucene Index Dierctory Found, Invoke indexBuild() First !"); System.out.println("No Lucene Index Dierctory Found, Invoke indexBuild() First !"); System.exit(1);//from w ww . j a v a 2 s .co m } IndexReader reader = DirectoryReader.open(indexDir); IndexSearcher searcher = new IndexSearcher(reader); Query q = null; HashMap<String, ArrayList<ArrayList<String>>> allCandidates = new HashMap<String, ArrayList<ArrayList<String>>>(); if (!allCandidates.containsKey(inputRecord)) { try { ArrayList<ArrayList<String>> topHits = new ArrayList<ArrayList<String>>(); //System.out.println("query is : "+inputRecord); q = new MultiFieldQueryParser(new String[] { "DATA" }, analyzer).parse(inputRecord); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage); searcher.search(q, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; for (int i = 0; i < hits.length; ++i) { ArrayList<String> tmp1 = new ArrayList<String>(); int docId = hits[i].doc; Document d; try { d = searcher.doc(docId); tmp1.add(d.get("ID")); tmp1.add(d.get("DATA")); tmp1.add(((Float) hits[i].score).toString()); } catch (IOException e) { e.printStackTrace(); } topHits.add(tmp1); } allCandidates.put(inputRecord, topHits); } catch (org.apache.lucene.queryparser.classic.ParseException e) { e.printStackTrace(); } } ArrayList<EntryData> resolvedEntities = new ArrayList<EntryData>(); pickBestCandidates(resolvedEntities, allCandidates); reader.close(); return resolvedEntities; }
From source file:lucenesearch.Mallet.java
public void getMalletOutput() throws IOException { int hitsPerPage = 10000000; String index = new Searcher().getPostIndexPath(); IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index))); IndexSearcher searcher = new IndexSearcher(reader); BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); //booleanQuery.add(new QueryParser("Body", analyzer).parse(""), BooleanClause.Occur.MUST); booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST); TopDocs results;// w w w . j a v a 2 s . c om results = searcher.search(booleanQuery.build(), hitsPerPage); ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; System.out.println(numTotalHits + " total matching documents"); int start = 0; int end = Math.min(numTotalHits, hitsPerPage); PrintWriter pw = new PrintWriter("./data/mallet.txt"); StringBuilder sb = new StringBuilder(); for (int i = start; i < end; i++) { System.out.println("Doc " + i); Document doc = searcher.doc(hits[i].doc); ArrayList<String> res = LuceneUtils.getAnalyzedRemoveHtml(doc.get("Body")); int id = Integer.parseInt(doc.get("SId")); sb = new StringBuilder(); sb.append(id); sb.append("\t"); for (String re : res) { re = re.replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("<.+?>", "").replaceAll(" +", " ") .replaceAll("[^\\x00-\\x7F]", " ").trim(); sb.append(re).append(" "); } sb.append("\n"); pw.print(sb.toString()); } pw.close(); }
From source file:lucenesearch.NGram.java
public void getNGram(int n, int hitPP) throws IOException, ParseException { int hitsPerPage = hitPP; String index = new Searcher().getPostIndexPath(); IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index))); IndexSearcher searcher = new IndexSearcher(reader); BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); //booleanQuery.add(new QueryParser("Body", analyzer).parse(""), BooleanClause.Occur.MUST); booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST); TopDocs results;// www .j a va 2 s . co m results = searcher.search(booleanQuery.build(), hitsPerPage); ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; System.out.println(numTotalHits + " total matching documents"); int start = 0; int end = Math.min(numTotalHits, hitsPerPage); PrintWriter pw = new PrintWriter("./data/grams/" + n + "gram.csv"); StringBuilder sb = new StringBuilder(); for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); ArrayList<String[]> tmp = getNGrams(doc, new ExtendedDocument(hits[i].doc, reader), n); for (String[] ngrams : tmp) { sb = new StringBuilder(); sb.append(doc.get("SId")); sb.append(","); sb.append(toTabbedStr(ngrams)); sb.append(","); ArrayList<String> tagg = tags.get(Integer.parseInt(doc.get("SId"))); sb.append(implodeTabbed(tagg)); sb.append("\n"); if (tagg.size() > 1) pw.print(sb.toString()); } } pw.close(); }
From source file:lucenesearch.RelevantPostFinder.java
public void saveRelevantPost() throws SQLException, IOException, ParseException { String url = "jdbc:mysql://localhost:3306/sof17"; String username = "root"; String password = "root"; String folderPath = "./data/rel_posts/"; String dupNotFound = "./data/dup_not_exist.txt"; int hitsPerPage = 10000; System.out.println("Connecting database..."); Connection conn = DriverManager.getConnection(url, username, password); System.out.println("Database connected!"); Statement stmt = conn.createStatement(); String query = "select PostId,PostBody,OriginalPostId from java_test_data"; ResultSet rs = stmt.executeQuery(query); String index = new Searcher().getPostIndexPath(); IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index))); IndexSearcher searcher = new IndexSearcher(reader); // searcher.setSimilarity(new BM25Similarity(0.05f, 0.03f)); //!!!!!!!!!!!!!! searcher.setSimilarity(new BM25Similarity()); //!!!!!!!!!!!!!! Analyzer analyzer = new StandardAnalyzer(); int cnt = 0;/*from www .ja v a 2 s .c o m*/ while (rs.next()) { System.out.println("Processing post " + (++cnt)); int postid = rs.getInt("PostId"); int dupId = rs.getInt("OriginalPostId"); ArrayList<String> bd = LuceneUtils.getAnalyzedRemoveHtml(rs.getString("PostBody").replace(':', ' ')); StringBuilder sb = new StringBuilder(); int j = 0; for (String b : bd) { if (++j > 600) break; sb.append(b); sb.append(" "); } String body = sb.toString(); BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 1), BooleanClause.Occur.MUST); booleanQuery.add(new QueryParser("Tags", analyzer).parse("java"), BooleanClause.Occur.MUST); booleanQuery.add(new QueryParser("Body", analyzer).parse(body), BooleanClause.Occur.MUST); TopDocs results; results = searcher.search(booleanQuery.build(), hitsPerPage); ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; System.out.println(numTotalHits + " total matching documents"); int start = 0; 
int end = Math.min(numTotalHits, hitsPerPage); PrintWriter out = new PrintWriter(folderPath + postid + ".txt"); boolean isFound = false; for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); int id = Integer.parseInt(doc.get("SId")); String s = doc.get("Body"); if (id == dupId) isFound = true; out.println(id); } out.close(); if (!isFound) { System.out.println("Duplicate not found"); PrintWriter out2 = new PrintWriter( new FileOutputStream(new File(dupNotFound), true /* append = true */)); out2.println(postid); out2.close(); } } rs.close(); stmt.close(); conn.close(); }
From source file:lucenesearch.TagBodyCount.java
/**
 * Compares which accepted answers are retrieved by the original query
 * (searchTag) versus by each translated term in bodyTerms, then prints set
 * statistics (query-only, translations-only, both, neither).
 *
 * @param bodyTerms translated/alternative query terms
 * @param N         maximum hits per query
 * @throws IOException    on index access failure
 * @throws ParseException if a term is not valid query syntax
 */
public void calculateWord(String[] bodyTerms, int N) throws IOException, ParseException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(new Searcher().getPostIndexPath())));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();
        HashSet<Integer> found = new HashSet<>(); // accepted answers hit by any translation
        HashSet<Integer> self = new HashSet<>();  // accepted answers hit by the original query

        System.out.println("Calculating word itself: " + searchTag);
        BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
        booleanQuery.add(new QueryParser("Body", analyzer).parse(searchTag), BooleanClause.Occur.MUST);
        booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);
        TopDocs results = searcher.search(booleanQuery.build(), N);
        ScoreDoc[] hits = results.scoreDocs;
        int numTotalHits = results.totalHits;
        System.out.println(numTotalHits + " total matching documents");
        int start = 0;
        int end = Math.min(numTotalHits, N);
        int count = 0;
        int skip = 0;
        for (int i = start; i < end; i++) {
            Document doc = searcher.doc(hits[i].doc);
            if (doc.get("SId") == null) { // documents without a stored id are skipped
                skip++;
                continue;
            }
            int id = Integer.parseInt(doc.get("SId"));
            if (this.acceptedAnswers.contains(id)) {
                self.add(id);
                count++;
            }
        }
        System.out.println("Total Post Cnt = " + count + "/" + this.acceptedAnswers.size());
        System.out.println("Total skipped Post = " + skip);

        // Same scan, once per translated term, accumulating into `found`.
        for (String bodyTerm : bodyTerms) {
            System.out.println("Query for: " + bodyTerm);
            booleanQuery = new BooleanQuery.Builder();
            booleanQuery.add(new QueryParser("Body", analyzer).parse(bodyTerm), BooleanClause.Occur.MUST);
            booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);
            results = searcher.search(booleanQuery.build(), N);
            hits = results.scoreDocs;
            numTotalHits = results.totalHits;
            System.out.println(numTotalHits + " total matching documents");
            start = 0;
            end = Math.min(numTotalHits, N);
            count = 0;
            skip = 0;
            for (int i = start; i < end; i++) {
                Document doc = searcher.doc(hits[i].doc);
                if (doc.get("SId") == null) {
                    skip++;
                    continue;
                }
                int id = Integer.parseInt(doc.get("SId"));
                if (this.acceptedAnswers.contains(id)) {
                    found.add(id);
                    count++;
                }
            }
            System.out.println("Total Post Cnt = " + count + "/" + this.acceptedAnswers.size());
            System.out.println("Total skipped Post = " + skip);
            System.out.println("-----------------");
        }

        System.out.println("Self Count = " + self.size() + "/" + this.acceptedAnswers.size());
        System.out.println("Final Count = " + found.size() + "/" + this.acceptedAnswers.size());

        // Set algebra over the two result pools.
        HashSet<Integer> intersect = new HashSet<>();
        intersect.addAll(self);
        intersect.retainAll(found);
        HashSet<Integer> q_only = new HashSet<>();
        q_only.addAll(self);
        q_only.removeAll(found);
        System.out.println("Retrieved by normal query only," + q_only.size());
        HashSet<Integer> tr_only = new HashSet<>();
        tr_only.addAll(found);
        tr_only.removeAll(self);
        System.out.println("Retrieved by translations only," + tr_only.size());
        System.out.println("Retrieved by both methods," + intersect.size());
        HashSet<Integer> diff = new HashSet<>();
        diff.addAll(acceptedAnswers);
        diff.removeAll(self);
        diff.removeAll(found);
        System.out.println("Retrieved by no method," + diff.size());
    } finally {
        // BUG FIX: the original never closed the IndexReader.
        reader.close();
    }
}
From source file:lucenesearch.TagBodyCount.java
public void calculateCount(String[] bodyTerms, int N) throws IOException, ParseException { IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(new Searcher().getPostIndexPath()))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(); HashSet<Integer> found = new HashSet<>(); HashSet<Integer> self = new HashSet<>(); System.out.println("Calculating word itself: " + searchTag); BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder(); booleanQuery.add(new QueryParser("Body", analyzer).parse(searchTag), BooleanClause.Occur.MUST); booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST); TopDocs results;/*from w ww. j a v a 2 s .co m*/ results = searcher.search(booleanQuery.build(), N); ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; System.out.println(numTotalHits + " total matching documents"); int start = 0; int end = Math.min(numTotalHits, N); int count = 0; int skip = 0; for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); if (doc.get("SId") == null) { skip++; continue; } int id = Integer.parseInt(doc.get("SId")); if (this.acceptedAnswers.contains(id)) { self.add(id); count++; } } System.out.println("Total Post Cnt = " + count + "/" + this.acceptedAnswers.size()); System.out.println("Total skipped Post = " + skip); int[] counts = new int[bodyTerms.length]; int[] accum_counts = new int[bodyTerms.length]; int cnt = 0; for (String bodyTerm : bodyTerms) { HashSet<Integer> temp = new HashSet<>(); System.out.println("Query for: " + bodyTerm); booleanQuery = new BooleanQuery.Builder(); booleanQuery.add(new QueryParser("Body", analyzer).parse(bodyTerm), BooleanClause.Occur.MUST); booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST); results = searcher.search(booleanQuery.build(), N); hits = results.scoreDocs; numTotalHits = results.totalHits; System.out.println(numTotalHits + " total matching 
documents"); start = 0; end = Math.min(numTotalHits, N); count = 0; skip = 0; for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); if (doc.get("SId") == null) { skip++; continue; } int id = Integer.parseInt(doc.get("SId")); if (this.acceptedAnswers.contains(id)) { temp.add(id); } } HashSet<Integer> temp2 = new HashSet<>(); temp2.addAll(temp); temp.removeAll(found); temp.removeAll(self); found.addAll(temp2); counts[cnt] = temp.size(); accum_counts[cnt] = cnt == 0 ? temp.size() : accum_counts[cnt - 1] + temp.size(); cnt++; System.out.println("Total Post Cnt = " + count + "/" + this.acceptedAnswers.size()); System.out.println("Total skipped Post = " + skip); System.out.println("-----------------"); } System.out.println("-----Final Count-----"); System.out.println("Self," + ((double) self.size() / acceptedAnswers.size()) * 100); for (int i = 0; i < cnt; i++) { System.out.println("Tr" + (i + 1) + "," + ((double) counts[i] / acceptedAnswers.size()) * 100); } System.out.println("-----Final Accum Count-----"); // System.out.println("Self,"+((double)self.size()/acceptedAnswers.size())*100); // for (int i = 0; i < cnt; i++) // { // System.out.println("Tr"+(i+1)+","+((double)accum_counts[i]/acceptedAnswers.size())*100); // } System.out.println("Cnt,Method,Value"); for (int i = 0; i < cnt; i++) { System.out.println((i + 1) + "," + "Translation" + "," + ((double) accum_counts[i] / acceptedAnswers.size()) * 100); System.out .println((i + 1) + "," + "self" + "," + ((double) self.size() / acceptedAnswers.size()) * 100); } }