Example usage for org.apache.lucene.index Fields terms

List of usage examples for org.apache.lucene.index Fields terms

Introduction

This page collects example usages of the org.apache.lucene.index Fields.terms method.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Get the Terms for this field. This will return null if the field does not exist.
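
Most of the examples below follow the same pattern: obtain a Fields instance (typically via MultiFields.getFields on a composite reader, or from a stored term vector), call terms(field) for the field of interest, and walk the resulting TermsEnum. Here is a minimal sketch against the Lucene 4.x API; the class name, index path, and field name are placeholders:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class FieldsTermsSketch {
    public static void main(String[] args) throws IOException {
        // Hypothetical index location.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            // MultiFields merges the per-segment fields into a single view;
            // it returns null for an index with no fields.
            Fields fields = MultiFields.getFields(reader);
            if (fields != null) {
                // terms(field) returns null when the field does not exist.
                Terms terms = fields.terms("contents");
                if (terms != null) {
                    TermsEnum te = terms.iterator(null); // Lucene 4.x; use terms.iterator() in 5.x+
                    BytesRef term;
                    while ((term = te.next()) != null) {
                        System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
                    }
                }
            }
        } finally {
            reader.close();
        }
    }
}

Both null checks matter in practice: MultiFields.getFields returns null for an empty index and Fields.terms returns null for an unknown field, which several of the examples below guard against. Note also the API drift visible across the examples: Lucene 4.x uses terms.iterator(reuse), while 5.x and later use the no-argument terms.iterator().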

Usage

From source file:info.boytsov.lucene.FreqWordDict.java

License:Open Source License

public FreqWordDict(IndexReader reader, String fieldName, int minTermFreq, int maxTermQty) throws Exception {

    Fields fields = MultiFields.getFields(reader);
    terms = fields.terms(fieldName);

    TreeSet<TermDesc> tmpTerms = new TreeSet<TermDesc>();

    TermsEnum termIter = terms.iterator(null);

    for (int termId = 0; termIter.next() != null; ++termId) {
        if (termIter.docFreq() >= minTermFreq) {
            TermDesc ts = new TermDesc(fieldName, termId, termIter.term(), termIter.docFreq());

            tmpTerms.add(ts);
        }
    }

    termDescPos = new TreeMap<TermDesc, Integer>();
    termTextPos = new TreeMap<BytesRef, Integer>();

    int pos = 0;
    for (TermDesc ts : tmpTerms) {
        termDescPos.put(ts, pos);
        if (termTextPos.containsKey(ts.text)) {
            throw new Exception("Bug: the key '" + ts.getText() + "' is already in the map!");
        }
        termTextPos.put(ts.text, pos);
        if (++pos >= maxTermQty)
            break;
    }
}

From source file:info.boytsov.lucene.GetTotPostQty.java

License:Open Source License

public static void main(String[] args) {
    if (args.length != 1) {
        printUsage();
        System.exit(1);
    }
    String srcDirName = args[0];

    System.out.println("Source dir: " + srcDirName);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();

        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms(FIELD_NAME);

        long totalInts = 0;
        int termQty = 0;

        for (TermsEnum termIter = terms.iterator(null); termIter.next() != null;) {
            totalInts += termIter.docFreq();
            //System.out.println(termQty + " -> " + termIter.docFreq());
            ++termQty;
            if (termQty % 1000000 == 0)
                System.out.println("Read " + termQty + " dictionary terms");
        }

        System.out.println("Term qty: " + termQty + " Doc qty: " + docQty + " postings qty: " + totalInts);

    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }

}

From source file:io.anserini.index.IndexUtils.java

License:Apache License

void printIndexStats() throws IOException {
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms(LuceneDocumentGenerator.FIELD_BODY);

    System.out.println("Index statistics");
    System.out.println("----------------");
    System.out.println("documents:             " + reader.numDocs());
    System.out.println("documents (non-empty): " + reader.getDocCount(LuceneDocumentGenerator.FIELD_BODY));
    System.out.println("unique terms:          " + terms.size());
    System.out.println(
            "total terms:           " + reader.getSumTotalTermFreq(LuceneDocumentGenerator.FIELD_BODY));

    System.out.println("stored fields:");

    FieldInfos fieldInfos = MultiFields.getMergedFieldInfos(reader);
    for (String fd : fields) {
        FieldInfo fi = fieldInfos.fieldInfo(fd);
        System.out.println("  " + fd + " (" + "indexOption: " + fi.getIndexOptions() + ", hasVectors: "
                + fi.hasVectors() + ", hasPayloads: " + fi.hasPayloads() + ")");
    }
}

From source file:io.datalayer.lucene.frequency.AosFrequencyTerms.java

License:Apache License

/**
 * Collect the terms with the highest docFreq from the given fields.
 *
 * @param  reader     index reader
 * @param  numTerms   maximum number of terms to collect
 * @param  fieldNames fields to scan, or null to scan all fields
 * @return AosTermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static AosTermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;

    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOGGER.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOGGER.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        Iterator<String> fieldsEnum = fields.iterator();
        while (fieldsEnum.hasNext()) {
            String field = fieldsEnum.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }

    AosTermStats[] result = new AosTermStats[tiq.size()];
    // the priority queue pops its smallest entry first, so we fill the
    // array from the end backwards to get the highest docFreq terms first
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}

From source file:it.cnr.ilc.lc.clavius.search.Tester.java

private static void searchWithContext(String term) {

    try {
        logger.info("searchWithContext(" + term + ")");
        SpanQuery spanQuery = new SpanTermQuery(new Term("content", term));
        Directory indexDirectory = FSDirectory.open(
                Paths.get("/var/lucene/claviusTest/indexes/it.cnr.ilc.lc.clavius.search.entity.PlainText"));
        DirectoryReader indexReader = DirectoryReader.open(indexDirectory);
        IndexSearcher searcher = new IndexSearcher(indexReader);
        IndexReader reader = searcher.getIndexReader();
        Spans spans = spanQuery.createWeight(searcher, false)
                .getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);
        ScoreDoc[] sc = searcher.search(spanQuery, 10).scoreDocs;

        logger.info("hits :" + sc.length);

        int i;
        if (null != spans) {
            for (int k = 0; k < sc.length; k++) {
                int docId = sc[k].doc;
                logger.info("docID: " + docId);
                int newDocID = spans.advance(docId);
                logger.info("newDocID: " + newDocID);

                int nextSpan = -1;
                while ((nextSpan = spans.nextStartPosition()) != Spans.NO_MORE_POSITIONS) {
                    logger.info("nextSpan             : " + nextSpan);
                    logger.info("spans.startPosition(): " + spans.startPosition());
                    logger.info("spans.endPosition()  : " + spans.endPosition());
                    logger.info("spans.width()        : " + spans.width());

                    Fields fields = reader.getTermVectors(docId);
                    Terms terms = fields.terms("content");

                    TermsEnum termsEnum = terms.iterator();
                    BytesRef text;
                    PostingsEnum postingEnum = null;
                    int start = spans.startPosition() - 3;
                    int end = spans.endPosition() + 3;
                    while ((text = termsEnum.next()) != null) {
                        //could store the BytesRef here, but String is easier for this example
                        String s = text.utf8ToString();
                        postingEnum = termsEnum.postings(postingEnum);
                        if (postingEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                            i = 0;
                            int position = -1;
                            while (i < postingEnum.freq() && (position = postingEnum.nextPosition()) != -1) {
                                if (position >= start && position <= end) {
                                    logger.info("pos: " + position + ", term: " + s + " offset: " + text.offset
                                            + " length: " + text.length);
                                }
                                i++;
                            }

                        }

                    }
                }
            }
        } else {
            logger.info("no " + term + " found!");
        }
    } catch (IOException e) {
        logger.error(e.getMessage());
    }
    logger.info("End.");
}

From source file:lucene.searchengine.LuceneSearchEngine.java

public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneSearchEngine indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneSearchEngine(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from the user until they enter q to quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call closeIndex;
    // otherwise the index is never committed to disk
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));

    //===========================================================
    //  Get term frequencies
    //===========================================================
    // Create an output file to store the (term, frequency) pairs.
    PrintWriter tfwriter = new PrintWriter("..\\term-frequency.csv");

    Fields fields = MultiFields.getFields(reader);
    HashMap<String, Long> tfmap = new HashMap<String, Long>();
    Terms terms = fields.terms("contents");
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef bref = null;
    while ((bref = termsEnum.next()) != null) {
        String term_name = bref.utf8ToString();
        Term term_instance = new Term("contents", term_name);
        long termFrequency = reader.totalTermFreq(term_instance);
        tfmap.put(term_name, termFrequency);
    }
    System.out.println(tfmap.size());
    for (String key : tfmap.keySet()) {
        tfwriter.write(key + "," + tfmap.get(key));
        tfwriter.write("\n");
    }
    tfwriter.close();
    //====================================================================
    // End of the term-frequency code
    //====================================================================
    IndexSearcher searcher = new IndexSearcher(reader);
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }

            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("filename") + " score=" + hits[i].score);
            }
            // 5. term stats: note that the raw (unanalyzed) query string is
            // looked up here, which may not match the analyzed form in the index
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}

From source file:lucene.security.index.SecureAtomicReaderTestBase.java

License:Apache License

@Test
public void testTermWalk() throws IOException, ParseException {
    SecureAtomicReader secureReader = getSecureReader();
    Fields fields = secureReader.fields();
    for (String field : fields) {
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef ref;
        while ((ref = termsEnum.next()) != null) {
            System.out.println(field + " " + ref.utf8ToString());
            DocsEnum docsEnum = termsEnum.docs(null, null);
            int doc;
            while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                System.out.println(field + " " + ref.utf8ToString() + " " + doc);
            }
        }
    }
    secureReader.close();
}

From source file:lucene.security.search.DocumentVisibilityFilter.java

License:Apache License

@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    AtomicReader reader = context.reader();
    List<DocIdSet> list = new ArrayList<DocIdSet>();

    Fields fields = reader.fields();
    Terms terms = fields.terms(_fieldName);
    if (terms == null) {
        if (acceptDocs instanceof DocIdSet) {
            return (DocIdSet) acceptDocs;
        } else {
            return wrap(acceptDocs);
        }
    }
    TermsEnum iterator = terms.iterator(null);
    BytesRef bytesRef;
    DocumentVisibilityEvaluator visibilityEvaluator = new DocumentVisibilityEvaluator(_authorizations);
    while ((bytesRef = iterator.next()) != null) {
        if (isVisible(visibilityEvaluator, bytesRef)) {
            DocIdSet docIdSet = _filterCacheStrategy.getDocIdSet(_fieldName, bytesRef, reader);
            if (docIdSet != null) {
                list.add(docIdSet);
            } else {
                DocsEnum docsEnum = iterator.docs(acceptDocs, null);
                list.add(buildCache(reader, docsEnum, bytesRef));
            }
        }
    }
    return getLogicalOr(list);
}

From source file:lucenetools.TermData.java

License:Apache License

/**
 * Extract all terms from the indexed "contents" field.  Ignore
 * any terms with freq counts less than the minTermFreq value or 
 * greater than the maxPercentage value. Assign each term a 
 * dictionary index.
 *
 * @param  reader        reader for index
 * @param  dictMap       mapping of index to term
 * @param  minTermFreq   minimum value for frequency else pruned
 * @param  numDocs       total number of documents in corpus
 * @param  maxPercentage maximum value for percentage else pruned
 * @return boolean       true on success, false if an error occurred
 */
public static boolean extractTerms(IndexReader reader, Map<String, Integer> dictMap, int minTermFreq,
        int numDocs, int maxPercentage) {
    // a TreeSet sorts the dictionary strings
    Set<String> dictionary = new TreeSet<>();
    int maxOccurrences = (int) (((double) maxPercentage / 100.0) * (double) numDocs);
    try {
        Fields fields = MultiFields.getFields(reader);
        if (null != fields) {
            // get terms derived from content
            Terms terms = fields.terms(CONTENTSFIELD);
            TermsEnum te = terms.iterator(null);

            while (te.next() != null) {
                String term_str = te.term().utf8ToString();

                // number of occurrences of this term in all docs (rowsum)
                int total_term_freq = (int) (te.totalTermFreq());
                if (total_term_freq >= minTermFreq) {
                    if (maxPercentage != 100) {
                        if (total_term_freq <= maxOccurrences) {
                            dictionary.add(term_str);
                        }
                    } else {
                        dictionary.add(term_str);
                    }

                }
            }
        } else {
            System.err.println("No fields found in index.");
            return false;
        }
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
        return false;
    }

    // build a map associating the dictionary index with each term
    int index = 0;
    dictMap.clear();
    for (String s : dictionary)
        dictMap.put(s, index++);

    return true;
}

From source file:lux.IndexTestSupport.java

License:Mozilla Public License

public static void printAllTerms(Directory dir, XmlIndexer indexer) throws IOException {
    DirectoryReader reader = DirectoryReader.open(dir);
    Fields fields = MultiFields.getFields(reader);
    System.out.println("Printing all terms (except uri)");
    String uriFieldName = indexer.getConfiguration().getFieldName(FieldRole.URI);
    for (String field : fields) {
        if (field.equals(uriFieldName)) {
            continue;
        }
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef text;
        int count = 0;
        while ((text = termsEnum.next()) != null && count++ < 100) {
            System.out.println(field + " " + text.utf8ToString() + ' ' + termsEnum.docFreq());
        }
    }
    reader.close();
}