Example usage for org.apache.lucene.index IndexReader getTermVector

Introduction

This page collects example usages of org.apache.lucene.index.IndexReader.getTermVector from open source projects.

Prototype

public final Terms getTermVector(int docID, String field) throws IOException 

Document

Retrieve term vector for this document and field, or null if term vectors were not indexed.
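A minimal sketch of calling getTermVector, assuming an index at /path/to/index whose "content" field was indexed with term vectors (both names are placeholders; the no-argument iterator() is the Lucene 5.x signature, while 4.x takes a reuse argument):

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class GetTermVectorSketch {
    public static void main(String[] args) throws Exception {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));
        Terms vector = reader.getTermVector(0, "content"); // first document, placeholder field
        if (vector == null) {
            System.out.println("Term vectors were not indexed for this document/field.");
        } else {
            TermsEnum termsEnum = vector.iterator();
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // on a term vector, totalTermFreq() is the frequency within this document
                System.out.println(term.utf8ToString() + "\t" + termsEnum.totalTermFreq());
            }
        }
        reader.close();
    }
}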

Usage

From source file:ci6226.facetsearch.java

public static void main(String[] args) throws Exception {
    String index = "./myindex";
    String field = "text";
    String queries = null;
    int hitsPerPage = 10;
    boolean raw = false;

    //http://lucene.apache.org/core/4_0_0/facet/org/apache/lucene/facet/doc-files/userguide.html#facet_accumulation

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    // :Post-Release-Update-Version.LUCENE_XY:

    //TODO: use the same analyzer that was used to build the index
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    BufferedReader in = null;
    if (queries != null) {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
    // :Post-Release-Update-Version.LUCENE_XY:
    QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);
    while (true) {

        System.out.println("Enter query: ");
        String line = in.readLine();
        line = line.trim();
        if (line.length() == 0) {
            break;
        }
        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));
        Date start = new Date();
        searcher.search(query, null, 100); // timed search; the results are re-fetched below
        Date end = new Date();
        System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        TopDocs results = searcher.search(query, 5 * hitsPerPage);
        ScoreDoc[] hits = results.scoreDocs;
        int numTotalHits = results.totalHits;

        //N = total number of documents
        //df = number of documents that contain the term
        //idf = log(N/df)

        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.println(ANSI_BLUE + (i + 1) + ANSI_RESET + "\nScore=\t" + hits[i].score);
            String rtext = doc.get(field);
            System.out.println("Text=\t" + rtext);

            // hits[i].doc is the document ID; the loop index i is only the hit rank
            Terms vector = reader.getTermVector(hits[i].doc, field);
            if (vector == null)
                continue;

            TermsEnum termsEnum = vector.iterator(null); // Lucene 4.x reuse-style iterator
            Map<String, Integer> frequencies = new HashMap<>();
            BytesRef text = null;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int freq = (int) termsEnum.totalTermFreq();
                frequencies.put(term, freq);
                // System.out.println("Time: "+term + " idef "+freq);
            }

        }

        System.out.println(numTotalHits + " total matching documents");

    }

    reader.close();
}
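The comment block inside the loop above sketches the classic idf formula, idf = log(N/df). A hedged sketch of computing it against the same reader (the term "lucene" is a placeholder; numDocs() and docFreq() are standard IndexReader methods):

int N = reader.numDocs(); // total number of live documents
int df = reader.docFreq(new Term(field, "lucene")); // documents containing the term
double idf = Math.log((double) N / Math.max(1, df)); // Math.max guards against df == 0
System.out.println("idf(lucene) = " + idf);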

From source file:com.globalsight.ling.lucene.HighFreqTerms.java

License:Apache License

public static void main(String[] args) throws Exception {
    IndexReader reader = null;
    if (args.length == 1) {
        SimpleFSDirectory fsd = new SimpleFSDirectory(new File(args[0]));
        reader = DirectoryReader.open(fsd);
    } else {
        usage();
        System.exit(1);
    }

    TermInfoQueue tiq = new TermInfoQueue(numTerms);
    //TODO: verify this is the correct field
    String field = IndexDocument.TEXT;
    Terms terms = reader.getTermVector(0, field); // term vector of the first document only
    if (terms == null) {
        System.out.println("Term vectors were not indexed for field " + field);
        return;
    }
    TermsEnum termsEnum = terms.iterator(null);

    BytesRef next = null;

    while ((next = termsEnum.next()) != null) {
        tiq.insertWithOverflow(new TermInfo(new Term(field, termsEnum.term()), termsEnum.docFreq()));
    }

    while (tiq.size() != 0) {
        TermInfo termInfo = (TermInfo) tiq.pop();
        System.out.println(termInfo.term + " " + termInfo.docFreq);
    }

    reader.close();
}

From source file:com.mathworks.xzheng.advsearching.CategorizerTest.java

License:Apache License

private void buildCategoryVectors() throws IOException {
    IndexReader reader = DirectoryReader.open(TestUtil.getBookIndexDirectory());

    int maxDoc = reader.maxDoc();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // null when the index has no deletions

    for (int i = 0; i < maxDoc; i++) {
        if (liveDocs == null || liveDocs.get(i)) { // skip deleted documents
            Document doc = reader.document(i);
            String category = doc.get("category");

            Map vectorMap = (Map) categoryMap.get(category);
            if (vectorMap == null) {
                vectorMap = new TreeMap();
                categoryMap.put(category, vectorMap);
            }

            Terms terms = reader.getTermVector(i, "subject");

            addTermFreqToMap(vectorMap, terms);
        }
    }
}
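The helper addTermFreqToMap is not shown in this excerpt. A hedged sketch of what it plausibly does, accumulating each term's in-document frequency into the category's vector map (raw Map types kept to match the excerpt):

private void addTermFreqToMap(Map vectorMap, Terms terms) throws IOException {
    if (terms == null) {
        return; // document had no term vector for the "subject" field
    }
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        int freq = (int) termsEnum.totalTermFreq();
        Integer existing = (Integer) vectorMap.get(term);
        vectorMap.put(term, existing == null ? freq : existing + freq);
    }
}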

From source file:com.o19s.solr.swan.highlight.TermVectorFun.java

License:Apache License

@Test
public void testBlah() throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    // Index some made up content
    IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47));
    IndexWriter writer = new IndexWriter(ramDir, iwf);
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setTokenized(true);
    ft.setStored(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.freeze();
    for (int i = 0; i < DOCS.length; i++) {
        Document doc = new Document();
        StringField id = new StringField("id", "doc_" + i, StringField.Store.YES);
        doc.add(id);
        // Store both position and offset information
        Field text = new Field("content", DOCS[i], ft);
        // (the FieldType above replaces the pre-4.0 flags Field.Index.ANALYZED
        // and Field.TermVector.WITH_POSITIONS_OFFSETS)
        doc.add(text);
        writer.addDocument(doc);
    }
    // keep the writer open: DirectoryReader.open(writer, true) returns a near-real-time reader
    // Get a searcher
    AtomicReader dr = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer, true));
    IndexSearcher searcher = new IndexSearcher(dr);
    // Do a search using SpanQuery
    SpanTermQuery fleeceQ = new SpanTermQuery(new Term("content", "fleece"));
    TopDocs results = searcher.search(fleeceQ, 10);
    for (int i = 0; i < results.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = results.scoreDocs[i];
        System.out.println("Score Doc: " + scoreDoc);
    }
    IndexReader reader = searcher.getIndexReader();
    Bits acceptDocs = null;
    Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
    Spans spans = fleeceQ.getSpans(dr.getContext(), acceptDocs, termContexts);

    while (spans.next()) {
        System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end());
        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("content");
        reader.document(spans.doc(), visitor);
        Terms terms = reader.getTermVector(spans.doc(), "content");
        TermsEnum tenum = terms.iterator(null);

        while (tenum.next() != null) {
            System.out.println(tenum.term().utf8ToString());
        }
    }
}

From source file:com.romeikat.datamessie.core.processing.service.fulltext.query.QueryUtil.java

License:Open Source License

public List<String> getIndexTerms(final FullTextSession fullTextSession, final int luceneDocumentId,
        final Class<?> clazz, final String field) {
    final IndexReader indexReader = fullTextSession.getSearchFactory().getIndexReaderAccessor().open(clazz);
    try {
        final Terms terms = indexReader.getTermVector(luceneDocumentId, field);
        if (terms == null) {
            return Collections.emptyList(); // term vectors were not indexed for this field
        }
        final List<String> termsList = Lists.newArrayListWithExpectedSize((int) terms.size());

        final TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            final String term = text.utf8ToString();
            termsList.add(term);
        }

        return termsList;
    } catch (final IOException e) {
        LOG.error("Could not determine index terms", e);
        return null;
    } finally {
        // release the reader obtained from the accessor
        fullTextSession.getSearchFactory().getIndexReaderAccessor().close(indexReader);
    }
}

From source file:Dl4j.TermInfo.java

protected Map<String, TermInfo> buildTerms(IndexReader reader, int docId) throws Exception {
    Map<String, TermInfo> wmap = new HashMap<>();
    Terms tfvector;
    TermsEnum termsEnum;
    String termText;
    BytesRef term;

    tfvector = reader.getTermVector(docId, CONTENET_FIELD_NAME);
    if (tfvector == null) {
        return wmap; // no term vector stored for this document/field
    }

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        termText = term.utf8ToString();

        TermInfo termInfo = wmap.get(termText);
        if (termInfo == null) {
            termInfo = new TermInfo(termText, getTermId(termText));
        }
        // each term appears once in the enum, so use its in-document frequency (tf++ would always leave tf at 1)
        termInfo.tf += (int) termsEnum.totalTermFreq();
        wmap.put(termText, termInfo);
    }

    return wmap;
}
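The comment above mentions a normalized tf vector, but the normalization step is outside this excerpt. A hedged sketch of L2-normalizing the returned map (assumes TermInfo exposes the tf counter used above):

Map<String, TermInfo> wmap = buildTerms(reader, docId);
double norm = 0;
for (TermInfo ti : wmap.values()) {
    norm += (double) ti.tf * ti.tf; // sum of squared term frequencies
}
norm = Math.sqrt(norm);
Map<String, Double> normalized = new HashMap<>();
for (Map.Entry<String, TermInfo> e : wmap.entrySet()) {
    normalized.put(e.getKey(), norm == 0 ? 0.0 : e.getValue().tf / norm);
}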

From source file:edu.cuhk.hccl.cmd.AppSearchEngine.java

License:Apache License

public static void main(String[] args) throws IOException {

    // Get parameters
    CommandLineParser parser = new BasicParser();
    Options options = createOptions();

    File dataFolder = null;
    String queryStr = null;
    int topK = 0;
    File resultFile = null;
    String queryType = null;
    File similarityFile = null;

    try {
        CommandLine line = parser.parse(options, args);

        dataFolder = new File(line.getOptionValue('d'));
        queryStr = line.getOptionValue('q');
        queryType = line.getOptionValue('t');

        topK = Integer.parseInt(line.getOptionValue('k'));
        resultFile = new File(line.getOptionValue('f'));
        similarityFile = new File(line.getOptionValue('s'));

        if (line.hasOption('m')) {
            String modelPath = line.getOptionValue('m');

            if (queryType.equalsIgnoreCase("WordVector")) {
                expander = new WordVectorExpander(modelPath);
            } else if (queryType.equalsIgnoreCase("WordNet")) {
                expander = new WordNetExpander(modelPath);
            } else {
                System.out.println("Please choose a correct expander: WordNet or WordVector!");
                System.exit(-1);
            }
        }

    } catch (ParseException exp) {
        System.out.println("Error in parameters: \n" + exp.getMessage());
        System.exit(-1);
    }

    // Create Index
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = createIndex(dataFolder, analyzer);

    // Build query
    Query query = buildQuery(analyzer, queryStr, queryType);

    // Search index for topK hits
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector = TopScoreDocCollector.create(topK, true);
    searcher.search(query, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    // Show search results
    System.out.println("\n[INFO] " + hits.length + " hits were returned:");
    List<String> hitLines = new ArrayList<String>();

    for (int i = 0; i < hits.length; i++) {
        int docId = hits[i].doc;
        Document d = searcher.doc(docId);

        String line = (i + 1) + "\t" + d.get(PATH_FIELD) + "\t" + hits[i].score;

        System.out.println(line);

        hitLines.add(line);
    }

    // Compute cosine similarity between documents
    List<String> simLines = new ArrayList<String>();
    for (int m = 0; m < hits.length; m++) {
        int doc1 = hits[m].doc;
        Terms terms1 = reader.getTermVector(doc1, CONTENT_FIELD);

        for (int n = m + 1; n < hits.length; n++) {
            int doc2 = hits[n].doc;
            Terms terms2 = reader.getTermVector(doc2, CONTENT_FIELD);

            CosineDocumentSimilarity cosine = new CosineDocumentSimilarity(terms1, terms2);
            double similarity = cosine.getCosineSimilarity();
            String line = searcher.doc(doc1).get(PATH_FIELD) + "\t" + searcher.doc(doc2).get(PATH_FIELD) + "\t"
                    + similarity;
            simLines.add(line);
        }
    }

    // Release resources
    reader.close();
    if (expander != null) {
        expander.close();
    }

    // Save search results
    System.out.println("\n[INFO] Search results are saved in file: " + resultFile.getPath());
    FileUtils.writeLines(resultFile, hitLines, false);

    System.out.println("\n[INFO] Cosine similarities are saved in file: " + similarityFile.getPath());
    FileUtils.writeLines(similarityFile, simLines, false);
}
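CosineDocumentSimilarity is project-specific and not shown in this excerpt. A hedged sketch of the underlying computation from two term vectors: the dot product over shared terms divided by the product of the vector norms (uses java.util.HashMap plus the Lucene Terms/TermsEnum/BytesRef types from the examples above):

static double cosineSimilarity(Terms terms1, Terms terms2) throws IOException {
    Map<String, Long> f1 = toFreqMap(terms1);
    Map<String, Long> f2 = toFreqMap(terms2);
    double dot = 0, norm1 = 0, norm2 = 0;
    for (Map.Entry<String, Long> e : f1.entrySet()) {
        norm1 += (double) e.getValue() * e.getValue();
        Long other = f2.get(e.getKey());
        if (other != null) {
            dot += (double) e.getValue() * other; // term occurs in both documents
        }
    }
    for (long v : f2.values()) {
        norm2 += (double) v * v;
    }
    return dot == 0 ? 0 : dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
}

static Map<String, Long> toFreqMap(Terms terms) throws IOException {
    Map<String, Long> freqs = new HashMap<>();
    if (terms == null) {
        return freqs; // term vectors not indexed
    }
    TermsEnum te = terms.iterator(null); // Lucene 4.x; use iterator() on 5.x+
    BytesRef text;
    while ((text = te.next()) != null) {
        freqs.put(text.utf8ToString(), te.totalTermFreq());
    }
    return freqs;
}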

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.Lucene.java

License:Open Source License

/**
 * Returns the term frequencies for a given document.
 * 
 * @param reader the index reader
 * @param field the field whose term vector to read
 * @param docID the document id
 * @return a map from term text to its frequency within the document
 * @throws IOException
 */
public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException {
    Map<String, Float> termFrequencies = new HashMap<>();
    Terms terms = reader.getTermVector(docID, field);
    if (terms == null) {
        return termFrequencies; // term vectors were not indexed for this field
    }
    TermsEnum itr = terms.iterator();
    BytesRef term = null;
    while ((term = itr.next()) != null) {
        String termText = term.utf8ToString();
        long termFreq = itr.totalTermFreq(); // term freq in doc with docID
        termFrequencies.put(termText, (float) termFreq);
    }
    return termFrequencies;
}
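A hedged usage sketch of the helper above (the index path and field name are placeholders; FSDirectory.open(Path) is the Lucene 5.x signature, matching the no-argument iterator() used in getTfs):

IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));
Map<String, Float> tfs = Lucene.getTfs(reader, "text", 0); // tf map of document 0
tfs.forEach((term, freq) -> System.out.println(term + "\t" + freq));
reader.close();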

From source file:edu.utsa.sifter.Result.java

License:Apache License

public DocTermInfo docRankFactors(final double[] features, final Date refDate, final IndexReader rdr,
        final Set<Term> termSet) throws IOException {
    // J.S.
    final double[] featuresA = new double[19];
    final DocTermInfo ret = new DocTermInfo();
    final String lowerExt = Extension.toLowerCase();
    if (!isUnallocated()) {
        features[HitRanker.FCREATED] = dateDiff(Created, refDate);
        features[HitRanker.FMODIFIED] = dateDiff(Modified, refDate);
        features[HitRanker.FACCESSED] = dateDiff(Accessed, refDate);
        features[HitRanker.FAVG_RECENCY] = (features[HitRanker.FCREATED] + features[HitRanker.FMODIFIED]
                + features[HitRanker.FACCESSED]) / 3;
        features[HitRanker.FFILENAME_DIRECT] = 0;
        features[HitRanker.FFILENAME_INDIRECT] = 0;
        final String fullPath = Path + Name;
        for (Term t : termSet) {
            if (fullPath.indexOf(t.text()) > -1) { // match anywhere, including position 0
                features[HitRanker.FFILENAME_INDIRECT] = 1;
                break;
            }
        }
        features[HitRanker.FUSER_DIRECTORY] = 0;
        for (String dir : SystemDirs) {
            if (Path.indexOf(dir) > -1) {
                features[HitRanker.FUSER_DIRECTORY] = 1;
                break;
            }
        }
    }
    features[HitRanker.FHIGH_PRIORITY_TYPE] = DocMaker.HighPriorityTypes.contains(lowerExt) ? 1 : 0;
    features[HitRanker.FMED_PRIORITY_TYPE] = DocMaker.MedPriorityTypes.contains(lowerExt) ? 1 : 0;
    features[HitRanker.FLOW_PRIORITY_TYPE] = features[HitRanker.FHIGH_PRIORITY_TYPE]
            + features[HitRanker.FMED_PRIORITY_TYPE] > 0 ? 0 : 1;

    final Terms terms = rdr.getTermVector(LuceneID, "body");
    if (terms == null) {
        return ret; // no term vector stored for the body field
    }
    final TermsEnum term = terms.iterator(null);

    double dotSum = 0, docVecSumSqrs = 0, numDims = 0, queryVecSumSqrs = 0;

    long termCount = 0;

    while (term.next() != null) {
        ++numDims;
        termCount = term.totalTermFreq();
        docVecSumSqrs += termCount * termCount;
        if (termSet.contains(new Term("body", term.term()))) {
            dotSum += termCount;
            ++queryVecSumSqrs;
            ret.TermFreqs.put(BytesRef.deepCopyOf(term.term()), termCount);
            ret.MaxTermFreq = Math.max(ret.MaxTermFreq, termCount);
            // System.err.println(Path + Name + " contains term " + term.term().utf8ToString() + ", with freq " + termCount);
        }
    }
    // cosine similarity = dot product divided by the product of the vector norms
    final double denom = Math.sqrt(docVecSumSqrs) * Math.sqrt(queryVecSumSqrs);
    features[HitRanker.FCOSINE_SIMILARITY] = denom == 0 ? 0 : dotSum / denom;
    features[HitRanker.FTERM_CARDINALITY] = queryVecSumSqrs / termSet.size();

    // features[HitRanker.FTERM_LENGTH] 

    // features[HitRanker.FTERM_PRIORITY] = 0.0;
    return ret;
}

From source file:Evaluator.TermFreq.java

public DocVector(String docid, IndexReader reader, IndexSearcher searcher, String fileName)
        throws IOException, ParseException {
    this.docid = docid;
    this.termVector = new ArrayList<TermFreq>();
    Analyzer analyzer = new KeywordAnalyzer();
    QueryParser parser = new QueryParser("id", analyzer);
    Query query = parser.parse(docid);
    TopDocs topdocs = searcher.search(query, 1);
    if (topdocs.scoreDocs.length == 0) {
        throw new IOException("docid not found in index: " + docid);
    }
    int index = topdocs.scoreDocs[0].doc;
    VectorExtractor ve = new VectorExtractor();
    HashMap<String, ArrayList<Double>> h1 = ve.extractVector(fileName);
    vector = h1.get(docid);

    Terms terms = reader.getTermVector(index, "words");
    if (terms == null) {
        return; // no term vector stored for the "words" field
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        TermFreq tf = new TermFreq(term.utf8ToString(), (int) termsEnum.totalTermFreq());
        this.termVector.add(tf);
    }

}