Example usage for org.apache.lucene.index Term field

List of usage examples for org.apache.lucene.index Term field

Introduction

In this page you can find the example usage for org.apache.lucene.index Term field.

Prototype

String field

To view the source code for org.apache.lucene.index Term field, click the Source Link below.

Click Source Link

Usage

From source file:RangeFilter.java

License:Apache License

/**
 * Returns a BitSet with true for documents which should be
 * permitted in search results, and false for those that should
 * not.//from w  ww . java  2 s .  c  o  m
 */
/**
 * Returns a BitSet with true for documents which should be
 * permitted in search results, and false for those that should
 * not.
 *
 * Scans the term dictionary of {@code fieldName} from {@code lowerTerm}
 * (or the field's first term when lowerTerm is null) up to
 * {@code upperTerm}, honoring the includeLower/includeUpper exclusivity
 * flags, and sets a bit for every document containing a term in range.
 *
 * @param reader index to scan; reader.maxDoc() sizes the result
 * @return one bit per document id; set bits mark matching documents
 * @throws IOException on index access failure
 */
public BitSet bits(IndexReader reader) throws IOException {
    BitSet bits = new BitSet(reader.maxDoc());
    // Seek the enumerator to the lower bound, or to the first term of the
    // field when no lower bound was supplied.
    TermEnum enumerator = (null != lowerTerm ? reader.terms(new Term(fieldName, lowerTerm))
            : reader.terms(new Term(fieldName, "")));

    try {

        if (enumerator.term() == null) {
            return bits;
        }

        boolean checkLower = false;
        if (!includeLower) // make adjustments to set to exclusive
            checkLower = true;

        TermDocs termDocs = reader.termDocs();
        try {

            do {
                Term term = enumerator.term();
                if (term != null && term.field().equals(fieldName)) {
                    // For an exclusive lower bound, skip terms <= lowerTerm.
                    // Once one term passes, all later (sorted) terms do too,
                    // so the check is disabled after the first success.
                    if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) {
                        checkLower = false;
                        if (upperTerm != null) {
                            int compare = upperTerm.compareTo(term.text());
                            /* if beyond the upper term, or is exclusive and
                             * this is equal to the upper term, break out */
                            if ((compare < 0) || (!includeUpper && compare == 0)) {
                                break;
                            }
                        }
                        /* we have a good term, find the docs */

                        termDocs.seek(enumerator.term());
                        while (termDocs.next()) {
                            bits.set(termDocs.doc());
                        }
                    }
                } else {
                    // Terms are ordered by field first, so a different field
                    // (or exhausted enum) means we are past our range.
                    break;
                }
            } while (enumerator.next());

        } finally {
            termDocs.close();
        }
    } finally {
        enumerator.close();
    }

    return bits;
}

From source file:CountWords3.java

License:Apache License

/**
 * Counts distinct terms and total term occurrences in the "contents" field
 * of a Lucene index, optionally removing stop words and applying Porter
 * stemming, then prints a one-line summary.
 *
 * Usage: java CountWords3 indexdir
 *
 * @param args args[0] is the index directory
 * @throws Exception on index access or stop-word file failure
 */
public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        String usage = "Usage:\tjava CountWords3 indexdir";
        System.out.println(usage);
        System.exit(0);
    }

    String index = args[0];
    String field = "contents";

    IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)));
    TermEnum te = reader.terms();

    int totalOccs = 0;
    boolean notlastt = te.next();
    Map<String, Integer> terms = new HashMap<>();

    if (removeStopWords) {
        File stopWordsFile = new File("./src/org/apache/lucene/demo/stopwords_long_EN.txt");
        setStopWords(stopWordsFile);
    }

    while (notlastt) {
        Term t = te.term();
        if (t.field().equals(field)) { // ignore if not desired field
            if (!removeTerm(t.text()) && (!removeStopWords || !stopWords.contains(t.text()))) {
                // Sum this term's frequency across every document containing it.
                TermDocs td = reader.termDocs(t);
                int n = 0;
                boolean notlastd = td.next();
                while (notlastd) {
                    n += td.freq();
                    notlastd = td.next();
                }
                String text = t.text();
                if (useStemmer)
                    text = porterStemming(text);
                // Stemming may collapse several index terms onto one key,
                // so accumulate counts instead of overwriting.
                terms.merge(text, n, Integer::sum);
                totalOccs += n;
            }
        }
        notlastt = te.next();
    }
    terms = sortByComparator(terms, false);
    System.out.println("Distinct words: " + terms.size() + "; Word occurrences: " + totalOccs);
}

From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java

/**
 * Builds a Solr query string by OR-joining the text of each term, appending
 * a "^boost" suffix when the term's field carries a non-default boost.
 * Example output: {@code "foo^2.0 OR bar"}.
 *
 * @param terms non-empty list of query terms
 * @return the OR-joined query text with no trailing connective
 * @throws IllegalArgumentException if terms is empty
 */
private static String fromTermsToQueryText(List<Term> terms) {
    if (terms.isEmpty()) {
        throw new IllegalArgumentException("Empty terms.");
    }
    // StringBuilder avoids O(n^2) concatenation; prepending the separator
    // before each term after the first removes the need to strip a
    // trailing " OR " afterwards.
    StringBuilder query = new StringBuilder();

    for (Term t : terms) {
        if (query.length() > 0) {
            query.append(" OR ");
        }
        query.append(t.text());
        float boost = SolrField.getFieldByName(t.field()).getBoost();
        if (boost != SolrFieldBoost.DEFAULT_BOOST) {
            query.append('^').append(boost);
        }
    }
    return query.toString();
}

From source file:ca.gnewton.lusql.core.IndexTermFreqCache.java

License:Apache License

/**
 * Builds a per-field document-frequency cache over the given reader.
 *
 * @param newReader    index reader supplying terms and docFreq
 * @param newFieldName only terms in this field are cached
 * @param initSize     initial capacity of the backing map
 * @param newPreload   when true, eagerly fill the cache from the term dictionary
 * @throws IOException on index access failure
 */
public IndexTermFreqCache(final IndexReader newReader, final String newFieldName, int initSize,
        boolean newPreload) throws IOException {
    setPreload(newPreload);
    setReader(newReader);
    setFieldName(newFieldName);
    cache = new HashMap<String, Integer>(initSize);
    if (preload) {
        TermEnum te = reader.terms();
        try {
            while (te.next()) {
                Term term = te.term();
                if (term.field().equals(fieldName)) {
                    // Autoboxing replaces the deprecated new Integer(...);
                    // reuse the Term already fetched instead of te.term() again.
                    cache.put(term.text(), reader.docFreq(term));
                }
            }
        } finally {
            // The enumerator holds index resources; the original leaked it.
            te.close();
        }
    }
}

From source file:ch.ymc.lucehbase.LucandraTermEnum.java

License:Apache License

/**
 * Populates the term buffer starting at {@code skipTo}, either from the
 * in-memory term cache or by scanning the HBase term-vector column family.
 *
 * Side effects: resets {@code termPosition} to 0, replaces
 * {@code termBuffer}, may grow {@code termCache}, and registers this enum
 * with the reader's term-enum cache.
 *
 * @param skipTo first term of interest; its field name also bounds the scan
 * @throws IOException on HBase access failure
 */
private void loadTerms(Term skipTo) throws IOException {
    // chose starting term
    String startTerm = indexName + HBaseUtils.delimeter + HBaseUtils.createColumnName(skipTo);
    // this is where we stop: the field name with its last character bumped
    // by one, i.e. the first row key past this field's entire range
    String endTerm = indexName + HBaseUtils.delimeter + skipTo.field().substring(0, skipTo.field().length() - 1)
            + new Character((char) (skipTo.field().toCharArray()[skipTo.field().length() - 1] + 1)); // ;

    // Serve from the cache when possible: either we are resuming a fresh
    // enumeration (termPosition == 0) or skipping to a different term.
    if ((!skipTo.equals(initTerm) || termPosition == 0) && termCache != null) {
        termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
    } else {
        termDocFreqBuffer = null;
    }

    if (termDocFreqBuffer != null) {

        termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
        termPosition = 0;

        logger.debug("Found " + startTerm + " in cache");
        return;
    } else if (chunkCount > 1 && actualInitSize < maxChunkSize) {
        // A previous chunk came back short of a full page, so the scan is
        // exhausted: return an empty buffer to signal end of terms.
        termBuffer = new Term[] {};
        termPosition = 0;
        return; // done!
    }

    chunkCount++;

    // The first time we grab just a few keys
    int count = maxInitSize;

    // otherwise we grab all the rest of the keys
    if (initTerm != null) {
        count = maxChunkSize;
        startTerm = indexName + HBaseUtils.delimeter + HBaseUtils.createColumnName(initTerm);
    }

    long start = System.currentTimeMillis();

    termDocFreqBuffer = new TreeMap<Term, NavigableMap<byte[], byte[]>>();

    // Get all columns
    Scan scan = new Scan(startTerm.getBytes(), endTerm.getBytes());
    scan.addFamily(HBaseUtils.termVecColumnFamily);
    ResultScanner scanner = table.getScanner(scan);

    actualInitSize = 0;
    for (Result result : scanner) {
        ++actualInitSize;
        NavigableMap<byte[], byte[]> columns = result.getFamilyMap(HBaseUtils.termVecColumnFamily);
        byte[] row = result.getRow();
        String rowString = new String(row);

        // term keys look like wikipedia/body/wiki
        String termStr = rowString
                .substring(rowString.indexOf(HBaseUtils.delimeter) + HBaseUtils.delimeter.length());
        Term term;
        try {
            term = HBaseUtils.parseTerm(termStr.getBytes("UTF-8"));
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is mandatory on the JVM; this cannot realistically happen.
            throw new RuntimeException(e);
        }

        logger.debug(termStr + " has " + columns.size());

        //check for tombstone keys
        if (columns.size() > 0) {
            termDocFreqBuffer.put(term, columns);
        }

    }

    if (!termDocFreqBuffer.isEmpty()) {
        initTerm = termDocFreqBuffer.lastKey();
    }

    // term to start with next time
    logger.debug("Found " + actualInitSize + " keys in range:" + startTerm + " to " + endTerm + " in "
            + (System.currentTimeMillis() - start) + "ms");

    // add a final key (excluded in submap below)
    termDocFreqBuffer.put(finalTerm, null);

    // put in cache
    for (Term termKey : termDocFreqBuffer.keySet()) {

        if (termCache == null) {
            termCache = termDocFreqBuffer;
        } else {
            termCache.putAll(termDocFreqBuffer);
        }

        indexReader.addTermEnumCache(termKey, this);
    }

    // cache the initial term too
    indexReader.addTermEnumCache(skipTo, this);
    termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
    termPosition = 0;
    long end = System.currentTimeMillis();

    logger.debug("loadTerms: " + startTerm + "(" + termBuffer.length + ") took " + (end - start) + "ms");
}

From source file:com.barchart.feed.ddf.resolver.provider.CodecHelper.java

License:BSD License

/** convert instrument into lucene document */
/**
 * Converts an instrument into a Lucene document with two fields: a stored
 * (but unindexed) key field and an indexed (but unstored) full-text body.
 */
static Document instrumentEncode(final Instrument instrument) {

    final Document doc = new Document();

    // Key field: stored verbatim for retrieval, indexed as a single token.
    final Term keyTerm = getKeyTerm(instrument);
    final Field keyField = new Field(keyTerm.field(), keyTerm.text(), Field.Store.YES,
            Field.Index.NOT_ANALYZED);
    doc.add(keyField);

    // Body field: tokenized for full-text search, not stored back.
    final Field bodyField = new Field(CodecHelper.FIELD_INST_BODY, fullText(instrument), Field.Store.NO,
            Field.Index.ANALYZED);
    doc.add(bodyField);

    // TODO If individual instrument fields ever need encoding, iterate the
    // tags in CodecHelper.BASE here and add one stored, unindexed field per
    // tag. Not needed by any current caller.

    return doc;

}

From source file:com.basistech.lucene.tools.LuceneQueryTool.java

License:Apache License

/**
 * Parses and runs a query against the index, printing matching documents
 * (up to {@code outputLimit}) to {@code out}, optionally filtered by a
 * regex on {@code regexField}.
 *
 * A null queryString matches all documents. Field names referenced by the
 * query are validated against {@code allFieldNames} before searching.
 *
 * @param queryString the user query, or null for match-all
 * @param out         destination for printed documents and hit counts
 * @throws IOException on index access failure
 * @throws org.apache.lucene.queryparser.classic.ParseException on bad query syntax
 * @throws RuntimeException if the query lacks a field and no default is set,
 *         or if it references fields absent from the index
 */
private void runQuery(String queryString, final PrintStream out)
        throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    final IndexSearcher searcher = new IndexSearcher(indexReader);
    docsPrinted = 0;
    Query query;
    if (queryString == null) {
        query = new MatchAllDocsQuery();
    } else {
        if (!queryString.contains(":") && defaultField == null) {
            throw new RuntimeException("query has no ':' and no query-field defined");
        }
        QueryParser queryParser = new QueryParser(defaultField, analyzer);
        queryParser.setLowercaseExpandedTerms(false);
        query = queryParser.parse(queryString).rewrite(indexReader);
        // Reject queries referencing fields the index does not contain, so a
        // typo'd field name fails loudly instead of silently matching nothing.
        Set<Term> terms = Sets.newHashSet();
        query.createWeight(searcher, false).extractTerms(terms);
        List<String> invalidFieldNames = Lists.newArrayList();
        for (Term term : terms) {
            if (!allFieldNames.contains(term.field())) {
                invalidFieldNames.add(term.field());
            }
        }
        if (!invalidFieldNames.isEmpty()) {
            throw new RuntimeException("Invalid field names: " + invalidFieldNames);
        }
    }

    final Set<String> fieldSet = Sets.newHashSet(fieldNames);

    // use a Collector instead of TopDocs for memory efficiency, especially
    // for the %all query
    class MyCollector extends SimpleCollector {
        private Scorer scorer;
        private long totalHits;
        private int docBase;

        @Override
        protected void doSetNextReader(LeafReaderContext context) throws IOException {
            docBase = context.docBase;
        }

        @Override
        public void collect(int id) throws IOException {
            totalHits++;
            // Keep counting hits past the limit, but stop printing.
            if (docsPrinted >= outputLimit) {
                return;
            }

            // Segment-relative id -> global document id.
            id += docBase;
            Document doc = fieldSet.isEmpty() ? searcher.doc(id) : searcher.doc(id, fieldSet);
            boolean passedFilter = regexField == null;
            if (regexField != null) {
                String value = doc.get(regexField);
                if (value != null && regex.matcher(value).matches()) {
                    passedFilter = true;
                }
            }
            if (passedFilter) {
                float score = scorer.score();
                printDocument(doc, id, score, out);
            }
        }

        @Override
        public boolean needsScores() {
            return true;
        }

        @Override
        public void setScorer(Scorer scorer) throws IOException {
            this.scorer = scorer;
        }
    }

    MyCollector collector = new MyCollector();
    searcher.search(query, collector);
    if (showHits) {
        out.println("totalHits: " + collector.totalHits);
        out.println();
    }
}

From source file:com.bizosys.unstructured.IndexSearcher.java

License:Apache License

/**
 * Rewrites a user query by replacing each parsed term with a generated
 * placeholder key ("indexName:index") and recording the key's expanded
 * term encoding ("docType|field|hash|*|*") in {@code multiQueryParts}.
 *
 * Replacement is attempted case-insensitively: the original, lowercased,
 * and uppercased query are each probed for the term's position. The first
 * pass replaces terms followed by a space; the second pass handles the
 * final term.
 *
 * @param indexName       prefix used for generated placeholder keys
 * @param docType         document type name, or "*" for any
 * @param query           the raw user query to rewrite
 * @param analyzer        analyzer used by the query parser
 * @param multiQueryParts out-param: placeholder key -> expanded term encoding
 * @return the query with term texts replaced by placeholder keys
 * @throws Exception on parse or type-code lookup failure
 * @deprecated use the Analyzer-based searchQueryPartsFill overload instead
 */
@Deprecated
public String searchQueryPartsFill(String indexName, String docType, String query, Analyzer analyzer,
        Map<String, String> multiQueryParts) throws Exception {

    System.err.println(
            "\n\n\n************ Stop using this method and instead use the following method. ******************\n"
                    + "public String searchQueryPartsFill( Analyzer analyzer, boolean isAllWords, String multiQuery, Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception\n\n\n");

    String defaultField = "BIZOSYSNONE";

    QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
    Query q = qp.parse(query);
    Set<Term> terms = new HashSet<Term>();
    q.extractTerms(terms);

    int index = 0;
    // Maps the term's display form ("field:text" or bare text) to its
    // placeholder key, for the replacement passes below.
    Map<String, String> termsL = new HashMap<String, String>();
    if (!"*".equals(docType))
        docType = this.sConf.getDocumentTypeCodes().getCode(docType).toString();

    for (Term term : terms) {
        String fieldName = term.field();
        if (defaultField.equals(fieldName))
            fieldName = "*";
        else if ("*".equals(fieldName))
            fieldName = "*";
        else
            fieldName = this.sConf.getFieldTypeCodes().getCode(term.field()).toString();

        String fieldText = term.text();

        String expandedTerm = docType + "|" + fieldName + "|" + Hashing.hash(fieldText) + "|*|*";

        String lhs = indexName + ":" + index;
        multiQueryParts.put(lhs, expandedTerm);

        String fld = term.field();
        if (defaultField.equals(fld))
            termsL.put(fieldText, lhs);
        else
            termsL.put(term.field() + ":" + fieldText, lhs);
        index++;
    }

    //Replace the intermediate ones (terms followed by a space), probing the
    //query as-is, lowercased, and uppercased to find a case-variant match.
    for (String term : termsL.keySet()) {

        String caseQuery = null;
        for (int i = 0; i < 3; i++) {
            switch (i) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            term = term.replace(defaultField + ":", "");
            int caseTermIndex = caseQuery.indexOf(term + " ");
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
            }
        }
    }

    //Replace the last one (the term with no trailing space), same
    //case-variant probing; break after the first successful replacement.
    for (String term : termsL.keySet()) {
        String caseQuery = null;
        for (int j = 0; j < 3; j++) {
            switch (j) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            int caseTermIndex = caseQuery.indexOf(term);
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
                break;
            }
        }
    }

    return query;
}

From source file:com.bizosys.unstructured.IndexSearcher.java

License:Apache License

/**
 * Expands each query part named in {@code partsToAnalyze} into the internal
 * "docType|fieldType|hash|*|*" term encoding, where the Lucene field name
 * may be "docType/fieldType". Parts that parse into multiple terms are
 * exploded into numbered sub-keys joined with AND (isAllWords) or OR.
 *
 * @param analyzer        analyzer used to parse each query part
 * @param isAllWords      true joins exploded terms with AND, false with OR
 * @param multiQuery      top-level query whose part keys get substituted
 * @param multiQueryParts part-key -> raw query text; rewritten in place
 * @param partsToAnalyze  keys of multiQueryParts to expand
 * @return multiQuery with exploded part keys substituted in
 * @throws Exception on query-parse or type-code lookup failure
 */
public String searchQueryPartsFill(Analyzer analyzer, boolean isAllWords, String multiQuery,
        Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception {

    String defaultField = "BIZOSYSNONE";
    Map<Integer, String> explodedParts = new HashMap<Integer, String>();

    for (String qKey : partsToAnalyze) {
        QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
        Set<Term> terms = new HashSet<Term>();
        Query q = qp.parse(multiQueryParts.get(qKey));
        q.extractTerms(terms);

        int index = 1;
        explodedParts.clear();

        for (Term term : terms) {
            String fieldName = term.field();
            String fieldText = term.text();
            String docType = "*";
            String fieldType = "*";
            // Field names encode "docType/fieldType"; a bare name is docType only.
            int docAndFieldBreakPointIndex = fieldName.indexOf('/');

            if (-1 == docAndFieldBreakPointIndex) {
                docType = fieldName;
            } else {
                docType = fieldName.substring(0, docAndFieldBreakPointIndex);
                fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1);
            }

            if (docType.equals(defaultField))
                docType = "*";
            else if (!("*".equals(docType) || "".equals(docType))) {
                docType = sConf.getDocumentTypeCodes().getCode(docType).toString();
            }

            if (fieldType.equals(defaultField))
                fieldType = "*";
            else if (!("*".equals(fieldType) || "".equals(fieldType))) {
                fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString();
            }

            String expandedTerm = docType + "|" + fieldType + "|" + Hashing.hash(fieldText) + "|*|*";
            explodedParts.put(index, expandedTerm);
            index++;
        }

        if (explodedParts.size() > 1) {
            // Multi-term part: replace qKey with "( key1 AND/OR key2 ... )".
            multiQueryParts.remove(qKey);

            StringBuilder sb = new StringBuilder();
            boolean isFirst = true;
            for (Integer seq : explodedParts.keySet()) {
                String explodedKey = qKey + seq.toString();
                multiQueryParts.put(explodedKey, explodedParts.get(seq));
                if (isFirst)
                    isFirst = false;
                else {
                    if (isAllWords)
                        sb.append(" AND ");
                    else
                        sb.append(" OR ");
                }
                sb.append(explodedKey);
            }
            multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) ");
        } else {
            // Single-term part: overwrite the raw text with its encoding.
            // NOTE(review): if the part parsed to zero terms this stores null
            // (get(0) on an empty map) — confirm callers never pass empty parts.
            multiQueryParts.put(qKey, explodedParts.get(index - 1));
        }
    }

    return multiQuery;
}

From source file:com.bizosys.unstructured.IndexSearcher.java

License:Apache License

/**
 * Expands each query part named in {@code partsToAnalyze} into the internal
 * "docType|fieldType|payload|hash|*|*" term encoding, where the Lucene field
 * name may carry up to three '/'-separated segments:
 * docType/fieldType/payload. Parts that parse into multiple terms are
 * exploded into numbered sub-keys joined with AND (isAllWords) or OR.
 *
 * @param analyzer        analyzer used to parse each query part
 * @param isAllWords      true joins exploded terms with AND, false with OR
 * @param multiQuery      top-level query whose part keys get substituted
 * @param multiQueryParts part-key -> raw query text; rewritten in place
 * @param partsToAnalyze  keys of multiQueryParts to expand
 * @return multiQuery with exploded part keys substituted in
 * @throws Exception on query-parse or type-code lookup failure
 */
public String searchQueryPartsFillWithMetadata(Analyzer analyzer, boolean isAllWords, String multiQuery,
        Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception {

    String defaultField = "BIZOSYSNONE";
    Map<Integer, String> explodedParts = new HashMap<Integer, String>();

    for (String qKey : partsToAnalyze) {
        QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
        Set<Term> terms = new HashSet<Term>();
        Query q = qp.parse(multiQueryParts.get(qKey));
        q.extractTerms(terms);

        int index = 1;
        explodedParts.clear();

        for (Term term : terms) {
            String fieldName = term.field();
            String searchword = term.text();
            String docType = "*";
            String fieldType = "*";
            String payload = "*";

            int docAndFieldBreakPointIndex = fieldName.indexOf('/');

            if (-1 == docAndFieldBreakPointIndex) {
                docType = fieldName;
            } else {
                docType = fieldName.substring(0, docAndFieldBreakPointIndex);
                fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1);

                int fieldAndPayloadBreakPointIndex = fieldType.indexOf('/');
                if (fieldAndPayloadBreakPointIndex > 0) {
                    // BUG FIX: extract the payload from the ORIGINAL string
                    // before truncating fieldType. The previous code truncated
                    // first and then took substring(idx + 1) of the shortened
                    // string, which threw StringIndexOutOfBoundsException for
                    // any field name that actually carried a payload segment.
                    payload = fieldType.substring(fieldAndPayloadBreakPointIndex + 1);
                    fieldType = fieldType.substring(0, fieldAndPayloadBreakPointIndex);
                }
            }

            if (docType.equals(defaultField))
                docType = "*";
            else if (!("*".equals(docType) || "".equals(docType))) {
                docType = sConf.getDocumentTypeCodes().getCode(docType).toString();
            }

            if (fieldType.equals(defaultField))
                fieldType = "*";
            else if (!("*".equals(fieldType) || "".equals(fieldType))) {
                fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString();
            }

            String expandedTerm = docType + "|" + fieldType + "|" + payload + "|" + Hashing.hash(searchword)
                    + "|*|*";
            explodedParts.put(index, expandedTerm);
            index++;
        }

        if (explodedParts.size() > 1) {
            // Multi-term part: replace qKey with "( key1 AND/OR key2 ... )".
            multiQueryParts.remove(qKey);

            StringBuilder sb = new StringBuilder();
            boolean isFirst = true;
            for (Integer seq : explodedParts.keySet()) {
                String explodedKey = qKey + seq.toString();
                multiQueryParts.put(explodedKey, explodedParts.get(seq));
                if (isFirst)
                    isFirst = false;
                else {
                    if (isAllWords)
                        sb.append(" AND ");
                    else
                        sb.append(" OR ");
                }
                sb.append(explodedKey);
            }
            multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) ");
        } else {
            // Single-term part: overwrite the raw text with its encoding.
            multiQueryParts.put(qKey, explodedParts.get(index - 1));
        }
    }

    return multiQuery;
}