Example usage for org.apache.lucene.search IndexSearcher doc

List of usage examples for org.apache.lucene.search IndexSearcher doc

Introduction

This page collects example usages of the org.apache.lucene.search.IndexSearcher.doc method.

Prototype

public Document doc(int docID) throws IOException 

Source Link

Document

Sugar for .getIndexReader().document(docID)

Usage

From source file:luceneexample.LuceneExample.java

/**
 * Searches the Lucene index stored in {@code indexDirectory} for {@code queryString} and
 * prints the number of returned hits followed by the stored file name of each hit.
 * At most 30 hits are requested.
 *
 * @param indexDirectory path of the directory that holds the index
 * @param queryString    query text, parsed against the {@code LuceneConstants.CONTENTS} field
 * @throws IOException    if the index cannot be opened or read
 * @throws ParseException if {@code queryString} cannot be parsed
 */
private void searchInDirectory(String indexDirectory, String queryString) throws IOException, ParseException {

    System.out.println("Searching for '" + queryString + "'");
    File pathIndexDir = new File(indexDirectory);
    Directory index = FSDirectory.open(pathIndexDir);
    try {
        IndexReader reader = IndexReader.open(index);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                QueryParser queryParser = new QueryParser(Version.LUCENE_36, LuceneConstants.CONTENTS,
                        new StandardAnalyzer(Version.LUCENE_36));
                Query query = queryParser.parse(queryString);
                TopDocs tops = searcher.search(query, 30);
                ScoreDoc[] scoreDoc = tops.scoreDocs;
                System.out.println(scoreDoc.length);
                for (ScoreDoc score : scoreDoc) {
                    Document doc = searcher.doc(score.doc);
                    System.out.println("File: " + doc.get(LuceneConstants.FILE_NAME));
                }
            } finally {
                // Close exactly once, after all hits were read; previously this ran inside
                // the loop (once per hit, and never when there were no hits).
                searcher.close();
            }
        } finally {
            // The searcher was built from an existing reader, so the reader is closed here.
            reader.close();
        }
    } finally {
        index.close();
    }
}

From source file:luceneexamples.IndexAndSearch.java

License:Apache License

/**
 * Indexes two documents into an in-memory directory, searches the "str_field" field for
 * "fox", and checks that exactly one document matches and that its stored value is
 * returned unchanged.
 */
@Test
public void index() throws Exception {
    RAMDirectory ramDir = new RAMDirectory();
    Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriter indexWriter = new IndexWriter(ramDir,
            new IndexWriterConfig(Version.LUCENE_31, standardAnalyzer));

    // First document: the sentence that the query is expected to hit.
    Document foxDoc = new Document();
    Field foxField = new Field("str_field", "quick brown fox jumped over the lazy dog.", Field.Store.YES,
            Field.Index.ANALYZED);
    foxDoc.add(foxField);
    indexWriter.addDocument(foxDoc);

    // Second document: must not be matched by the query.
    Document otherDoc = new Document();
    Field otherField = new Field("str_field", "?????", Field.Store.YES,
            Field.Index.ANALYZED);
    otherDoc.add(otherField);
    indexWriter.addDocument(otherDoc);
    indexWriter.close();

    IndexSearcher indexSearcher = new IndexSearcher(ramDir, true);
    QueryParser queryParser = new QueryParser(Version.LUCENE_31, "str_field", standardAnalyzer);
    TopDocs topDocs = indexSearcher.search(queryParser.parse("fox"), 1000);
    assertThat(topDocs.totalHits, is(1));

    int hitId = topDocs.scoreDocs[0].doc;
    Document hit = indexSearcher.doc(hitId);
    assertEquals("quick brown fox jumped over the lazy dog.", hit.get("str_field"));

    indexSearcher.close();
    ramDir.close();
}

From source file:luceneexamples.NumericFieldDocument.java

License:Apache License

/**
 * Indexes four documents whose "int_field" holds 8, 9, 10 and 11, then checks that a
 * match-all search sorted on that field returns the stored values in ascending order.
 */
@Test
public void index() throws Exception {
    RAMDirectory ramDir = new RAMDirectory();
    Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriter indexWriter = new IndexWriter(ramDir,
            new IndexWriterConfig(Version.LUCENE_31, standardAnalyzer));

    int value = 8;
    while (value < 12) {
        Document numericDoc = new Document();
        numericDoc.add(new NumericField("int_field", Field.Store.YES, true).setIntValue(value));
        System.out.println(numericDoc);
        indexWriter.addDocument(numericDoc);
        value++;
    }
    indexWriter.commit();

    // The reader is opened from the still-open writer.
    IndexReader writerReader = IndexReader.open(indexWriter, true);
    IndexSearcher indexSearcher = new IndexSearcher(writerReader);
    TopDocs sorted = indexSearcher.search(new MatchAllDocsQuery(), 1000,
            new Sort(new SortField("int_field", SortField.INT)));
    assertThat(sorted.totalHits, is(4));

    String[] expectedOrder = { "8", "9", "10", "11" };
    for (int rank = 0; rank < expectedOrder.length; rank++) {
        Document hit = indexSearcher.doc(sorted.scoreDocs[rank].doc);
        assertThat(hit.get("int_field"), equalTo(expectedOrder[rank]));
    }

    writerReader.close();
    indexWriter.close();
    indexSearcher.close();
    ramDir.close();
}

From source file:luceneexamples.SortDocuments.java

License:Apache License

/**
 * Indexes three documents with "str_field" values "abc", "def" and "hij", then checks
 * that a match-all search sorted on that field returns them in ascending order and,
 * with the reverse flag set, in descending order.
 */
@Test
public void index() throws Exception {
    RAMDirectory ramDir = new RAMDirectory();
    Analyzer standardAnalyzer = new StandardAnalyzer(Version.LUCENE_31);
    IndexWriter indexWriter = new IndexWriter(ramDir,
            new IndexWriterConfig(Version.LUCENE_31, standardAnalyzer));

    String[] ascending = { "abc", "def", "hij" };
    for (String value : ascending) {
        Document entry = new Document();
        entry.add(new Field("str_field", value, Field.Store.YES, Field.Index.ANALYZED));
        indexWriter.addDocument(entry);
    }
    indexWriter.commit();

    IndexReader writerReader = IndexReader.open(indexWriter, true);
    IndexSearcher indexSearcher = new IndexSearcher(writerReader);

    // Ascending sort on the string field.
    TopDocs sorted = indexSearcher.search(new MatchAllDocsQuery(), 1000,
            new Sort(new SortField("str_field", SortField.STRING)));
    assertThat(sorted.totalHits, is(3));
    for (int rank = 0; rank < ascending.length; rank++) {
        Document hit = indexSearcher.doc(sorted.scoreDocs[rank].doc);
        assertThat(hit.get("str_field"), equalTo(ascending[rank]));
    }

    // Same search with the reverse flag set: expect the opposite order.
    sorted = indexSearcher.search(new MatchAllDocsQuery(), 1000,
            new Sort(new SortField("str_field", SortField.STRING, true)));
    assertThat(sorted.totalHits, is(3));
    for (int rank = 0; rank < ascending.length; rank++) {
        Document hit = indexSearcher.doc(sorted.scoreDocs[rank].doc);
        assertThat(hit.get("str_field"), equalTo(ascending[ascending.length - 1 - rank]));
    }

    writerReader.close();
    indexWriter.close();
    indexSearcher.close();
    ramDir.close();
}

From source file:luceneGazateer.EntryData.java

License:Apache License

/**
 * Searches the index at {@code indexerPath} for {@code inputRecord} in the "DATA" field
 * and passes the collected hits to {@code pickBestCandidates} to fill the result list.
 *
 * <p>Each hit is recorded as a three-element row: the stored "ID" value, the stored
 * "DATA" value and the hit score as a string. A hit whose document cannot be loaded is
 * logged and skipped, so no empty row is handed on. If no index exists at the given
 * path the JVM is terminated with exit code 1, as before.
 *
 * @param indexerPath path of the index directory
 * @param inputRecord query text
 * @param recordType  not used by this method
 * @return the list filled by {@code pickBestCandidates}
 * @throws IOException if the index cannot be opened or searched
 */
public ArrayList<EntryData> searchDocuments(String indexerPath, String inputRecord, DocType recordType)
        throws IOException {

    File indexfile = new File(indexerPath);
    indexDir = FSDirectory.open(indexfile.toPath());

    if (!DirectoryReader.indexExists(indexDir)) {
        LOG.log(Level.SEVERE, "No Lucene Index Dierctory Found, Invoke indexBuild() First !");
        System.out.println("No Lucene Index Dierctory Found, Invoke indexBuild() First !");
        System.exit(1);
    }

    HashMap<String, ArrayList<ArrayList<String>>> allCandidates = new HashMap<String, ArrayList<ArrayList<String>>>();
    ArrayList<EntryData> resolvedEntities = new ArrayList<EntryData>();

    // try-with-resources closes the reader even when the search or the candidate
    // selection throws.
    try (IndexReader reader = DirectoryReader.open(indexDir)) {
        IndexSearcher searcher = new IndexSearcher(reader);

        try {
            ArrayList<ArrayList<String>> topHits = new ArrayList<ArrayList<String>>();
            Query q = new MultiFieldQueryParser(new String[] { "DATA" }, analyzer).parse(inputRecord);

            TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;
            for (ScoreDoc hit : hits) {
                Document d;
                try {
                    d = searcher.doc(hit.doc);
                } catch (IOException e) {
                    // Skip this hit rather than adding an empty row to the candidates.
                    LOG.log(Level.SEVERE, "Could not load document " + hit.doc, e);
                    continue;
                }
                ArrayList<String> row = new ArrayList<String>();
                row.add(d.get("ID"));
                row.add(d.get("DATA"));
                row.add(Float.toString(hit.score));
                topHits.add(row);
            }
            allCandidates.put(inputRecord, topHits);
        } catch (org.apache.lucene.queryparser.classic.ParseException e) {
            // An unparsable record yields no candidates; the failure is logged, not rethrown.
            LOG.log(Level.SEVERE, "Could not parse query: " + inputRecord, e);
        }

        pickBestCandidates(resolvedEntities, allCandidates);
    }

    return resolvedEntities;

}

From source file:lucenesearch.Mallet.java

/**
 * Writes one line per indexed document with {@code PostTypeId == 2} to
 * {@code ./data/mallet.txt}: the numeric "SId" value, a tab, then the cleaned tokens
 * that {@code LuceneUtils.getAnalyzedRemoveHtml} returns for the "Body" field,
 * separated by spaces.
 *
 * <p>The index directory, reader and output writer are closed on every exit path.
 *
 * @throws IOException if the index cannot be read or the output file cannot be opened
 */
public void getMalletOutput() throws IOException {
    int hitsPerPage = 10000000;

    String index = new Searcher().getPostIndexPath();
    try (FSDirectory directory = FSDirectory.open(Paths.get(index));
            IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
        booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);

        TopDocs results = searcher.search(booleanQuery.build(), hitsPerPage);
        ScoreDoc[] hits = results.scoreDocs;

        int numTotalHits = results.totalHits;
        System.out.println(numTotalHits + " total matching documents");

        try (PrintWriter pw = new PrintWriter("./data/mallet.txt")) {
            for (int i = 0; i < hits.length; i++) {
                System.out.println("Doc " + i);
                Document doc = searcher.doc(hits[i].doc);
                ArrayList<String> res = LuceneUtils.getAnalyzedRemoveHtml(doc.get("Body"));

                int id = Integer.parseInt(doc.get("SId"));
                StringBuilder sb = new StringBuilder();
                sb.append(id);
                sb.append("\t");
                for (String re : res) {
                    // Flatten line breaks, drop markup remnants, collapse runs of spaces,
                    // and replace non-ASCII characters with a space.
                    re = re.replaceAll("\r\n", " ").replaceAll("\n", " ").replaceAll("<.+?>", "")
                            .replaceAll(" +", " ").replaceAll("[^\\x00-\\x7F]", " ").trim();
                    sb.append(re).append(" ");
                }
                sb.append("\n");
                pw.print(sb.toString());
            }
        }
    }

}

From source file:lucenesearch.NGram.java

/**
 * Writes the n-grams of indexed documents with {@code PostTypeId == 2} to
 * {@code ./data/grams/<n>gram.csv}.
 *
 * <p>Each output line has the form {@code SId,<n-gram>,<tags>}, using
 * {@code toTabbedStr} and {@code implodeTabbed} for the last two columns. Lines are
 * written only for documents whose tag list has more than one entry; documents with no
 * tag list are skipped.
 *
 * @param n     n-gram size, passed to {@code getNGrams} and used in the output file name
 * @param hitPP maximum number of hits to request from the index
 * @throws IOException    if the index cannot be read or the output file cannot be opened
 * @throws ParseException declared for callers; kept from the original signature
 */
public void getNGram(int n, int hitPP) throws IOException, ParseException {
    int hitsPerPage = hitPP;

    String index = new Searcher().getPostIndexPath();
    try (FSDirectory directory = FSDirectory.open(Paths.get(index));
            IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
        booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);

        TopDocs results = searcher.search(booleanQuery.build(), hitsPerPage);
        ScoreDoc[] hits = results.scoreDocs;

        int numTotalHits = results.totalHits;
        System.out.println(numTotalHits + " total matching documents");

        try (PrintWriter pw = new PrintWriter("./data/grams/" + n + "gram.csv")) {
            for (ScoreDoc hit : hits) {
                Document doc = searcher.doc(hit.doc);
                ArrayList<String[]> tmp = getNGrams(doc, new ExtendedDocument(hit.doc, reader), n);
                for (String[] ngrams : tmp) {
                    ArrayList<String> tagg = tags.get(Integer.parseInt(doc.get("SId")));
                    // Decide first whether the line will be written, so nothing is built
                    // for rows that are discarded; a missing tag list no longer throws.
                    if (tagg == null || tagg.size() <= 1) {
                        continue;
                    }
                    StringBuilder sb = new StringBuilder();
                    sb.append(doc.get("SId"));
                    sb.append(",");
                    sb.append(toTabbedStr(ngrams));
                    sb.append(",");
                    sb.append(implodeTabbed(tagg));
                    sb.append("\n");
                    pw.print(sb.toString());
                }
            }
        }
    }
}

From source file:lucenesearch.RelevantPostFinder.java

/**
 * For every row of the {@code java_test_data} table, searches the post index and writes
 * the "SId" of each hit to {@code ./data/rel_posts/<PostId>.txt}.
 *
 * <p>The query requires {@code PostTypeId == 1}, a "Tags" match for "java", and a "Body"
 * match for the first 600 tokens that {@code LuceneUtils.getAnalyzedRemoveHtml} returns
 * for the row's post body (with ':' replaced by a space). When the row's
 * {@code OriginalPostId} is not among the hits, the {@code PostId} is appended to
 * {@code ./data/dup_not_exist.txt}.
 *
 * <p>All JDBC resources, the index directory and reader, and the output writers are
 * closed on every exit path.
 *
 * @throws SQLException   if the database cannot be reached or queried
 * @throws IOException    if the index cannot be read or an output file cannot be written
 * @throws ParseException if a query string cannot be parsed
 */
public void saveRelevantPost() throws SQLException, IOException, ParseException {
    // NOTE(review): connection settings and credentials are hard-coded; consider moving
    // them to configuration.
    String url = "jdbc:mysql://localhost:3306/sof17";
    String username = "root";
    String password = "root";
    String folderPath = "./data/rel_posts/";
    String dupNotFound = "./data/dup_not_exist.txt";
    int hitsPerPage = 10000;
    String query = "select PostId,PostBody,OriginalPostId from java_test_data";

    System.out.println("Connecting database...");

    try (Connection conn = DriverManager.getConnection(url, username, password)) {
        System.out.println("Database connected!");

        try (Statement stmt = conn.createStatement();
                ResultSet rs = stmt.executeQuery(query);
                FSDirectory directory = FSDirectory.open(Paths.get(new Searcher().getPostIndexPath()));
                IndexReader reader = DirectoryReader.open(directory)) {

            IndexSearcher searcher = new IndexSearcher(reader);
            searcher.setSimilarity(new BM25Similarity());

            Analyzer analyzer = new StandardAnalyzer();

            int cnt = 0;

            while (rs.next()) {
                System.out.println("Processing post " + (++cnt));

                int postid = rs.getInt("PostId");
                int dupId = rs.getInt("OriginalPostId");
                ArrayList<String> bd = LuceneUtils
                        .getAnalyzedRemoveHtml(rs.getString("PostBody").replace(':', ' '));

                // Only the first 600 tokens are used for the body query.
                StringBuilder sb = new StringBuilder();
                int j = 0;
                for (String b : bd) {
                    if (++j > 600) {
                        break;
                    }
                    sb.append(b);
                    sb.append(" ");
                }
                String body = sb.toString();

                BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
                booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 1), BooleanClause.Occur.MUST);
                booleanQuery.add(new QueryParser("Tags", analyzer).parse("java"), BooleanClause.Occur.MUST);
                booleanQuery.add(new QueryParser("Body", analyzer).parse(body), BooleanClause.Occur.MUST);

                TopDocs results = searcher.search(booleanQuery.build(), hitsPerPage);
                ScoreDoc[] hits = results.scoreDocs;

                int numTotalHits = results.totalHits;
                System.out.println(numTotalHits + " total matching documents");

                boolean isFound = false;

                try (PrintWriter out = new PrintWriter(folderPath + postid + ".txt")) {
                    for (ScoreDoc hit : hits) {
                        Document doc = searcher.doc(hit.doc);
                        int id = Integer.parseInt(doc.get("SId"));
                        if (id == dupId) {
                            isFound = true;
                        }
                        out.println(id);
                    }
                }

                if (!isFound) {
                    System.out.println("Duplicate not found");
                    try (PrintWriter out2 = new PrintWriter(
                            new FileOutputStream(new File(dupNotFound), true /* append = true */))) {
                        out2.println(postid);
                    }
                }

            }
        }
    }
}

From source file:lucenesearch.TagBodyCount.java

/**
 * Compares which accepted answers are retrieved by querying the "Body" field for
 * {@code searchTag} itself versus querying it for each of {@code bodyTerms}, and prints
 * the per-query and overall counts.
 *
 * <p>Only documents with {@code PostTypeId == 2} are searched, and only hits whose "SId"
 * is contained in {@code acceptedAnswers} are counted. The index directory and reader
 * are closed on every exit path.
 *
 * @param bodyTerms query strings to run in addition to {@code searchTag}
 * @param N         maximum number of hits requested per query
 * @throws IOException    if the index cannot be read
 * @throws ParseException if a query string cannot be parsed
 */
public void calculateWord(String[] bodyTerms, int N) throws IOException, ParseException {
    try (FSDirectory directory = FSDirectory.open(Paths.get(new Searcher().getPostIndexPath()));
            IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        HashSet<Integer> found = new HashSet<>();
        HashSet<Integer> self = new HashSet<>();

        System.out.println("Calculating word itself: " + searchTag);
        collectAcceptedAnswerIds(searcher, analyzer, searchTag, N, self);

        for (String bodyTerm : bodyTerms) {
            System.out.println("Query for: " + bodyTerm);
            collectAcceptedAnswerIds(searcher, analyzer, bodyTerm, N, found);
            System.out.println("-----------------");
        }
        System.out.println("Self Count = " + self.size() + "/" + this.acceptedAnswers.size());
        System.out.println("Final Count = " + found.size() + "/" + this.acceptedAnswers.size());

        HashSet<Integer> intersect = new HashSet<>(self);
        intersect.retainAll(found);
        HashSet<Integer> q_only = new HashSet<>(self);
        q_only.removeAll(found);
        System.out.println("Retrieved by normal query only," + q_only.size());
        HashSet<Integer> tr_only = new HashSet<>(found);
        tr_only.removeAll(self);
        System.out.println("Retrieved by translations only," + tr_only.size());
        System.out.println("Retrieved by both methods," + intersect.size());
        HashSet<Integer> diff = new HashSet<>();
        diff.addAll(acceptedAnswers);
        diff.removeAll(self);
        diff.removeAll(found);
        System.out.println("Retrieved by no method," + diff.size());
    }
}

/**
 * Runs one "Body" query restricted to {@code PostTypeId == 2}, adds the "SId" of every
 * hit that is an accepted answer to {@code target}, and prints the hit statistics.
 */
private void collectAcceptedAnswerIds(IndexSearcher searcher, Analyzer analyzer, String bodyQuery, int maxHits,
        HashSet<Integer> target) throws IOException, ParseException {
    BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
    booleanQuery.add(new QueryParser("Body", analyzer).parse(bodyQuery), BooleanClause.Occur.MUST);
    booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);

    TopDocs results = searcher.search(booleanQuery.build(), maxHits);
    ScoreDoc[] hits = results.scoreDocs;

    int numTotalHits = results.totalHits;
    System.out.println(numTotalHits + " total matching documents");

    int count = 0;
    int skip = 0;

    for (ScoreDoc hit : hits) {
        Document doc = searcher.doc(hit.doc);
        String sid = doc.get("SId");
        if (sid == null) {
            // Documents without a stored "SId" cannot be matched to an answer.
            skip++;
            continue;
        }

        int id = Integer.parseInt(sid);
        if (this.acceptedAnswers.contains(id)) {
            target.add(id);
            count++;
        }
    }

    System.out.println("Total Post Cnt = " + count + "/" + this.acceptedAnswers.size());
    System.out.println("Total skipped Post = " + skip);
}

From source file:lucenesearch.TagBodyCount.java

/**
 * Measures how many accepted answers each query retrieves and prints the results as
 * percentages of {@code acceptedAnswers.size()}.
 *
 * <p>The "Body" field is first queried for {@code searchTag} ("self"), then for each
 * entry of {@code bodyTerms} in order. For each term, only the accepted answers not
 * already retrieved by "self" or by an earlier term are counted; a running total over
 * the terms is printed as well. Only documents with {@code PostTypeId == 2} are searched.
 * The index directory and reader are closed on every exit path.
 *
 * @param bodyTerms query strings to run after {@code searchTag}, in order
 * @param N         maximum number of hits requested per query
 * @throws IOException    if the index cannot be read
 * @throws ParseException if a query string cannot be parsed
 */
public void calculateCount(String[] bodyTerms, int N) throws IOException, ParseException {
    try (FSDirectory directory = FSDirectory.open(Paths.get(new Searcher().getPostIndexPath()));
            IndexReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();

        HashSet<Integer> found = new HashSet<>();
        HashSet<Integer> self = new HashSet<>();

        System.out.println("Calculating word itself: " + searchTag);
        BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
        booleanQuery.add(new QueryParser("Body", analyzer).parse(searchTag), BooleanClause.Occur.MUST);
        booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);

        TopDocs results = searcher.search(booleanQuery.build(), N);
        ScoreDoc[] hits = results.scoreDocs;

        int numTotalHits = results.totalHits;
        System.out.println(numTotalHits + " total matching documents");

        int count = 0;
        int skip = 0;

        for (ScoreDoc hit : hits) {
            Document doc = searcher.doc(hit.doc);
            if (doc.get("SId") == null) {
                skip++;
                continue;
            }

            int id = Integer.parseInt(doc.get("SId"));
            if (this.acceptedAnswers.contains(id)) {
                self.add(id);
                count++;
            }
        }

        System.out.println("Total Post Cnt = " + count + "/" + this.acceptedAnswers.size());
        System.out.println("Total skipped Post = " + skip);

        int[] counts = new int[bodyTerms.length];
        int[] accum_counts = new int[bodyTerms.length];
        int cnt = 0;
        for (String bodyTerm : bodyTerms) {
            HashSet<Integer> retrieved = new HashSet<>();
            System.out.println("Query for: " + bodyTerm);
            booleanQuery = new BooleanQuery.Builder();
            booleanQuery.add(new QueryParser("Body", analyzer).parse(bodyTerm), BooleanClause.Occur.MUST);
            booleanQuery.add(IntPoint.newExactQuery("PostTypeId", 2), BooleanClause.Occur.MUST);

            results = searcher.search(booleanQuery.build(), N);
            hits = results.scoreDocs;

            numTotalHits = results.totalHits;
            System.out.println(numTotalHits + " total matching documents");

            count = 0;
            skip = 0;

            for (ScoreDoc hit : hits) {
                Document doc = searcher.doc(hit.doc);
                if (doc.get("SId") == null) {
                    skip++;
                    continue;
                }

                int id = Integer.parseInt(doc.get("SId"));
                if (this.acceptedAnswers.contains(id)) {
                    retrieved.add(id);
                    // Previously never incremented here, so the per-term
                    // "Total Post Cnt" line always reported 0.
                    count++;
                }
            }

            // Answers this term adds beyond "self" and all earlier terms.
            HashSet<Integer> fresh = new HashSet<>(retrieved);
            fresh.removeAll(found);
            fresh.removeAll(self);
            found.addAll(retrieved);

            counts[cnt] = fresh.size();
            accum_counts[cnt] = cnt == 0 ? fresh.size() : accum_counts[cnt - 1] + fresh.size();
            cnt++;
            System.out.println("Total Post Cnt = " + count + "/" + this.acceptedAnswers.size());
            System.out.println("Total skipped Post = " + skip);
            System.out.println("-----------------");
        }
        System.out.println("-----Final Count-----");
        System.out.println("Self," + ((double) self.size() / acceptedAnswers.size()) * 100);
        for (int i = 0; i < cnt; i++) {
            System.out.println("Tr" + (i + 1) + "," + ((double) counts[i] / acceptedAnswers.size()) * 100);
        }
        System.out.println("-----Final Accum Count-----");
        System.out.println("Cnt,Method,Value");
        for (int i = 0; i < cnt; i++) {
            System.out.println((i + 1) + "," + "Translation" + ","
                    + ((double) accum_counts[i] / acceptedAnswers.size()) * 100);
            System.out
                    .println((i + 1) + "," + "self" + "," + ((double) self.size() / acceptedAnswers.size()) * 100);
        }
    }

}