Example usage for org.apache.lucene.index Term text

List of usage examples for org.apache.lucene.index Term text

Introduction

In this page you can find the example usage for org.apache.lucene.index Term text.

Prototype

public final String text() 

Source Link

Document

Returns the text of this term.

Usage

From source file: RangeFilter.java

License: Apache License

/**
 * Returns a BitSet with true for documents which should be
 * permitted in search results, and false for those that should
 * not.
 *
 * @param reader the index to enumerate terms and documents from
 * @return a BitSet sized to reader.maxDoc() with the ids of matching docs set
 * @throws IOException if term or document enumeration fails
 */
public BitSet bits(IndexReader reader) throws IOException {
    BitSet bits = new BitSet(reader.maxDoc());
    // Start the enumeration at the lower bound, or at the first term of
    // the field when no lower bound was given.
    TermEnum enumerator = (null != lowerTerm ? reader.terms(new Term(fieldName, lowerTerm))
            : reader.terms(new Term(fieldName, "")));

    try {

        if (enumerator.term() == null) {
            return bits;
        }

        // The enumerator already starts at lowerTerm, so a "strictly greater"
        // check is only needed while the lower bound is exclusive.
        boolean checkLower = !includeLower;

        TermDocs termDocs = reader.termDocs();
        try {

            do {
                Term term = enumerator.term();
                if (term == null || !term.field().equals(fieldName)) {
                    break; // ran past the last term of this field
                }
                if (!checkLower || null == lowerTerm || term.text().compareTo(lowerTerm) > 0) {
                    checkLower = false; // terms are sorted; no need to re-check
                    if (upperTerm != null) {
                        int compare = upperTerm.compareTo(term.text());
                        /* if beyond the upper term, or is exclusive and
                         * this is equal to the upper term, break out */
                        if ((compare < 0) || (!includeUpper && compare == 0)) {
                            break;
                        }
                    }
                    /* we have a good term, mark all of its documents */
                    termDocs.seek(enumerator.term());
                    while (termDocs.next()) {
                        bits.set(termDocs.doc());
                    }
                }
            } while (enumerator.next());

        } finally {
            termDocs.close();
        }
    } finally {
        enumerator.close();
    }

    return bits;
}

From source file:CountWords3.java

License:Apache License

/**
 * Counts word occurrences in a Lucene index and prints the number of
 * distinct terms plus the total number of occurrences in the "contents"
 * field. Optionally filters stop words and applies Porter stemming.
 *
 * Usage: java CountWords3 indexdir
 */
public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        System.out.println("Usage:\tjava CountWords3 indexdir");
        System.exit(0);
    }

    String index = args[0];
    String field = "contents";

    if (removeStopWords) {
        // NOTE(review): hard-coded relative path — only works when run from
        // the project root; confirm against the build layout.
        File stopWordsFile = new File("./src/org/apache/lucene/demo/stopwords_long_EN.txt");
        setStopWords(stopWordsFile);
    }

    IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)));
    try {
        TermEnum te = reader.terms();
        try {
            int totalOccs = 0;
            Map<String, Integer> terms = new HashMap<>();

            boolean hasMoreTerms = te.next();
            while (hasMoreTerms) {
                Term t = te.term();
                if (t.field().equals(field) // ignore terms from other fields
                        && !removeTerm(t.text())
                        && (!removeStopWords || !stopWords.contains(t.text()))) {
                    // Sum the term's frequency over every document containing it.
                    TermDocs td = reader.termDocs(t);
                    int n = 0;
                    try {
                        while (td.next()) {
                            n += td.freq();
                        }
                    } finally {
                        td.close();
                    }
                    String text = t.text();
                    if (useStemmer)
                        text = porterStemming(text);
                    // Stemming may collapse several raw terms onto one key,
                    // so accumulate rather than overwrite.
                    Integer prev = terms.get(text);
                    terms.put(text, prev == null ? n : prev + n);
                    totalOccs += n;
                }
                hasMoreTerms = te.next();
            }
            terms = sortByComparator(terms, false);
            System.out.println("Distinct words: " + terms.size() + "; Word occurrences: " + totalOccs);
        } finally {
            te.close();
        }
    } finally {
        reader.close(); // was leaked before
    }
}

From source file:au.edu.unimelb.csse.exp.GenerateQueries.java

License:Apache License

/**
 * Scans every term in the index and collects "label-like" term texts into
 * {@code textLabels}: the special label "S", two-letter all-uppercase
 * terms, and longer terms in which more than half the characters are
 * uppercase. Only terms with document frequency above MIN_DOC_FREQ are
 * considered.
 *
 * @throws IOException if the term enumeration fails
 */
private void getAllTerms() throws IOException {
    final TermEnum terms = reader.getIndexReader().terms();
    try {
        boolean next = terms.next();
        while (next) {
            Term term = terms.term();
            if (reader.docFreq(term) > MIN_DOC_FREQ) {
                String original = term.text().trim();
                // The previous version fell through after the two-letter
                // branch and re-added two-letter uppercase terms via the
                // majority-uppercase check; isLabel() decides once and the
                // term is added exactly once.
                if (isLabel(original)) {
                    textLabels.add(original);
                }
            }
            next = terms.next();
        }
    } finally {
        terms.close(); // always release the enumerator (was leaked before)
    }
}

/** Decides whether a term's text qualifies as a label. */
private static boolean isLabel(String text) {
    if (text.equals("S")) {
        return true; // sentence label, always kept
    }
    if (text.length() < 2) {
        return false;
    }
    if (text.length() == 2) {
        // Keep two-letter terms only when entirely uppercase.
        return text.toUpperCase().equals(text);
    }
    // Keep longer terms when more than half the characters are uppercase;
    // stop counting as soon as the outcome can no longer change.
    int uppercases = 0;
    for (int i = 0; i < text.length() && uppercases <= text.length() / 2; i++) {
        if (Character.isUpperCase(text.charAt(i))) {
            uppercases++;
        }
    }
    return uppercases > text.length() / 2;
}

From source file:au.edu.unimelb.csse.IndexUtils.java

License:Apache License

/**
 * Returns the document frequency of the first indexed term (in any field)
 * whose text equals {@code term}, or 0 when no such term exists.
 *
 * Note: this is a linear scan over every term in the index — O(#terms).
 *
 * @param term the term text to look for
 * @return the document frequency, or 0 if absent
 * @throws IOException if the term enumeration fails
 */
public int findDocFreq(String term) throws IOException {
    TermEnum terms = reader.terms();
    try {
        while (terms.next()) {
            Term t = terms.term();
            if (t.text().equals(term)) {
                return reader.docFreq(t);
            }
        }
        return 0;
    } finally {
        terms.close(); // was leaked before, including on the early return
    }
}

From source file: br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java

/**
 * Builds a Solr query string by OR-ing the text of every term, appending a
 * per-field boost ("text^boost") whenever it differs from the default.
 *
 * @param terms the query terms; must be non-empty
 * @return the rendered terms joined with " OR "
 * @throws IllegalArgumentException if {@code terms} is empty
 */
private static String fromTermsToQueryText(List<Term> terms) {
    if (terms.isEmpty()) {
        throw new IllegalArgumentException("Empty terms.");
    }
    // StringBuilder avoids the O(n^2) string concatenation of the previous
    // version; emitting the separator before each element (except the
    // first) also removes the trailing-" OR "-trim via substring.
    StringBuilder query = new StringBuilder();
    for (Term t : terms) {
        if (query.length() > 0) {
            query.append(" OR ");
        }
        query.append(t.text());
        float boost = SolrField.getFieldByName(t.field()).getBoost();
        if (boost != SolrFieldBoost.DEFAULT_BOOST) {
            query.append('^').append(boost);
        }
    }
    return query.toString();
}

From source file:com.barchart.feed.ddf.resolver.provider.CodecHelper.java

License:BSD License

/** convert instrument into lucene document */
static Document instrumentEncode(final Instrument instrument) {

    final Document doc = new Document();

    {// ww w .j  a  v a  2s  .co  m

        final Term term = getKeyTerm(instrument);

        final String name = term.field();
        final String value = term.text();

        /** store; do not index */
        final Field keyField = new Field(name, value, Field.Store.YES, Field.Index.NOT_ANALYZED);

        doc.add(keyField);

    }

    {

        final String name = CodecHelper.FIELD_INST_BODY;
        final String value = fullText(instrument);

        /** index; do not store */
        final Field bodyField = new Field(name, value, Field.Store.NO, Field.Index.ANALYZED);

        doc.add(bodyField);

    }

    // TODO If this ever need to be used, then some hardcoding
    // for iterating over the instrument fields will need to be written.

    // Currently, this isn't being used.

    //      for (final Tag<?> field : CodecHelper.BASE) {
    //
    //         final String name = field.name();
    //         final String value = encode(field, instrument.get(field));
    //
    //         /** store; do not index */
    //         final Field baseField = new Field(name, value, Field.Store.YES,
    //               Field.Index.NO);
    //
    //         doc.add(baseField);
    //
    //      }

    return doc;

}

From source file:com.bizosys.hsearch.kv.impl.KVDocIndexer.java

License:Apache License

/**
 * Translates a user query into the internal pipe-delimited search format:
 * docTypeCode|fieldTypeCode|*|{hash,hash,...}|*|*
 *
 * @param analyzer  analyzer used by the Lucene query parser
 * @param docType   document type name, or "*" for any
 * @param fieldType field type name, or "*" for any
 * @param query     the raw user query text
 * @return the encoded query string
 * @throws ParseException if the query cannot be parsed
 */
public String parseQuery(Analyzer analyzer, String docType, String fieldType, String query)
        throws IOException, ParseException, InstantiationException {

    String docTypeCode = "*".equals(docType) ? "*"
            : Integer.toString(DocumentTypeCodes.getInstance().getCode(docType));

    String fldTypeCode = "*".equals(fieldType) ? "*"
            : Integer.toString(FieldTypeCodes.getInstance().getCode(fieldType));

    QueryParser qp = new QueryParser(Version.LUCENE_36, "K", analyzer);
    Query q;
    try {
        q = qp.parse(query);
    } catch (org.apache.lucene.queryParser.ParseException ex) {
        // Preserve the original cause instead of dropping it.
        ParseException pe = new ParseException(ex.getMessage(), 0);
        pe.initCause(ex);
        throw pe;
    }
    Set<Term> terms = new HashSet<Term>();
    q.extractTerms(terms);

    // Build "{h1,h2,...}" from the hashes of all term texts. The previous
    // version left the builder null when no terms were extracted (e.g. a
    // pure wildcard query) and crashed with an NPE; now "{}" is emitted.
    StringBuilder allWords = new StringBuilder("{");
    boolean first = true;
    for (Term term : terms) {
        if (!first) {
            allWords.append(',');
        }
        allWords.append(Hashing.hash(term.text()));
        first = false;
    }
    allWords.append('}');

    StringBuilder queryBuilder = new StringBuilder(1024);
    queryBuilder.append(docTypeCode);
    queryBuilder.append('|');
    queryBuilder.append(fldTypeCode);
    queryBuilder.append('|');
    queryBuilder.append('*');
    queryBuilder.append('|');
    queryBuilder.append(allWords.toString());
    queryBuilder.append("|*|*");

    return queryBuilder.toString();
}

From source file:com.bizosys.unstructured.IndexSearcher.java

License:Apache License

/**
 * Rewrites {@code query} by replacing each parsed term with a generated
 * part key of the form "indexName:index", and records the expanded
 * pipe-delimited form (docType|fieldType|hash|*|*) of every term in
 * {@code multiQueryParts} under that key.
 *
 * @deprecated use
 *             {@code searchQueryPartsFill(Analyzer, boolean, String, Map, String...)}
 *             instead.
 */
@Deprecated
public String searchQueryPartsFill(String indexName, String docType, String query, Analyzer analyzer,
        Map<String, String> multiQueryParts) throws Exception {

    System.err.println(
            "\n\n\n************ Stop using this method and instead use the following method. ******************\n"
                    + "public String searchQueryPartsFill( Analyzer analyzer, boolean isAllWords, String multiQuery, Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception\n\n\n");

    // Placeholder field the parser assigns to terms with no explicit field.
    String defaultField = "BIZOSYSNONE";

    QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
    Query q = qp.parse(query);
    Set<Term> terms = new HashSet<Term>();
    q.extractTerms(terms);

    int index = 0;
    // Maps each term as it appears in the query ("field:text" or bare text)
    // to its generated part key ("indexName:index").
    Map<String, String> termsL = new HashMap<String, String>();
    if (!"*".equals(docType))
        docType = this.sConf.getDocumentTypeCodes().getCode(docType).toString();

    for (Term term : terms) {
        String fieldName = term.field();
        if (defaultField.equals(fieldName))
            fieldName = "*";
        else if ("*".equals(fieldName))
            fieldName = "*";
        else
            fieldName = this.sConf.getFieldTypeCodes().getCode(term.field()).toString();

        String fieldText = term.text();

        // Internal format: docType|fieldType|hash(text)|*|*
        String expandedTerm = docType + "|" + fieldName + "|" + Hashing.hash(fieldText) + "|*|*";

        String lhs = indexName + ":" + index;
        multiQueryParts.put(lhs, expandedTerm);

        String fld = term.field();
        if (defaultField.equals(fld))
            termsL.put(fieldText, lhs);
        else
            termsL.put(term.field() + ":" + fieldText, lhs);
        index++;
    }

    // Replace the intermediate occurrences: each term followed by a space is
    // swapped for its part key. The query is probed as-is, lowercased and
    // uppercased, because the analyzer may have normalized the term's case.
    for (String term : termsL.keySet()) {

        String caseQuery = null;
        for (int i = 0; i < 3; i++) {
            switch (i) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            term = term.replace(defaultField + ":", "");
            int caseTermIndex = caseQuery.indexOf(term + " ");
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
            }
        }
    }

    // Replace the last occurrence: same scheme but without the trailing
    // space requirement, so a term at the very end of the query matches too.
    for (String term : termsL.keySet()) {
        String caseQuery = null;
        for (int j = 0; j < 3; j++) {
            switch (j) {
            case 0:
                caseQuery = query;
                break;
            case 1:
                caseQuery = query.toLowerCase();
                break;
            case 2:
                caseQuery = query.toUpperCase();
                break;
            }
            int caseTermIndex = caseQuery.indexOf(term);
            if (caseTermIndex >= 0) {
                query = query.substring(0, caseTermIndex) + termsL.get(term)
                        + query.substring(caseTermIndex + term.length());
                break;
            }
        }
    }

    return query;
}

From source file:com.bizosys.unstructured.IndexSearcher.java

License:Apache License

/**
 * Expands each named part of a multi-part query into the internal
 * pipe-delimited format (docType|fieldType|hash|*|*). A part that parses
 * into several terms is split into numbered sub-parts which replace the
 * original key inside {@code multiQuery}, joined with AND or OR.
 *
 * @param analyzer        analyzer used to parse each part
 * @param isAllWords      true joins exploded sub-parts with AND, false with OR
 * @param multiQuery      the overall query text referencing part keys
 * @param multiQueryParts part-key to query-text map; rewritten in place
 * @param partsToAnalyze  keys of the parts to expand
 * @return the (possibly rewritten) multi-query
 */
public String searchQueryPartsFill(Analyzer analyzer, boolean isAllWords, String multiQuery,
        Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception {

    String defaultField = "BIZOSYSNONE";
    // TreeMap iterates keys in ascending order, making the AND/OR ordering
    // of exploded sub-parts deterministic; the previous HashMap left it
    // unspecified.
    Map<Integer, String> explodedParts = new java.util.TreeMap<Integer, String>();

    for (String qKey : partsToAnalyze) {
        QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
        Set<Term> terms = new HashSet<Term>();
        Query q = qp.parse(multiQueryParts.get(qKey));
        q.extractTerms(terms);

        int index = 1;
        explodedParts.clear();

        for (Term term : terms) {
            String fieldName = term.field();
            String fieldText = term.text();
            String docType = "*";
            String fieldType = "*";
            // Field names may encode "docType/fieldType".
            int docAndFieldBreakPointIndex = fieldName.indexOf('/');

            if (-1 == docAndFieldBreakPointIndex) {
                docType = fieldName;
            } else {
                docType = fieldName.substring(0, docAndFieldBreakPointIndex);
                fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1);
            }

            if (docType.equals(defaultField))
                docType = "*";
            else if (!("*".equals(docType) || "".equals(docType))) {
                docType = sConf.getDocumentTypeCodes().getCode(docType).toString();
            }

            if (fieldType.equals(defaultField))
                fieldType = "*";
            else if (!("*".equals(fieldType) || "".equals(fieldType))) {
                fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString();
            }

            String expandedTerm = docType + "|" + fieldType + "|" + Hashing.hash(fieldText) + "|*|*";
            explodedParts.put(index, expandedTerm);
            index++;
        }

        if (explodedParts.size() > 1) {
            multiQueryParts.remove(qKey);

            StringBuilder sb = new StringBuilder();
            boolean isFirst = true;
            for (Integer seq : explodedParts.keySet()) {
                String explodedKey = qKey + seq.toString();
                multiQueryParts.put(explodedKey, explodedParts.get(seq));
                if (isFirst)
                    isFirst = false;
                else
                    sb.append(isAllWords ? " AND " : " OR ");
                sb.append(explodedKey);
            }
            multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) ");
        } else if (explodedParts.size() == 1) {
            multiQueryParts.put(qKey, explodedParts.get(index - 1));
        }
        // else: no terms extracted — leave the original part untouched
        // (the previous version stored a null value under qKey here).
    }

    return multiQuery;
}

From source file:com.bizosys.unstructured.IndexSearcher.java

License:Apache License

/**
 * Like {@code searchQueryPartsFill}, but the field name may additionally
 * carry a payload segment: "docType/fieldType/payload". Each part is
 * expanded to docType|fieldType|payload|hash|*|* and multi-term parts are
 * split into numbered sub-parts joined with AND or OR.
 *
 * @param analyzer        analyzer used to parse each part
 * @param isAllWords      true joins exploded sub-parts with AND, false with OR
 * @param multiQuery      the overall query text referencing part keys
 * @param multiQueryParts part-key to query-text map; rewritten in place
 * @param partsToAnalyze  keys of the parts to expand
 * @return the (possibly rewritten) multi-query
 */
public String searchQueryPartsFillWithMetadata(Analyzer analyzer, boolean isAllWords, String multiQuery,
        Map<String, String> multiQueryParts, String... partsToAnalyze) throws Exception {

    String defaultField = "BIZOSYSNONE";
    // TreeMap iterates keys in ascending order, making the AND/OR ordering
    // of exploded sub-parts deterministic (the previous HashMap did not).
    Map<Integer, String> explodedParts = new java.util.TreeMap<Integer, String>();

    for (String qKey : partsToAnalyze) {
        QueryParser qp = new QueryParser(Version.LUCENE_36, defaultField, analyzer);
        Set<Term> terms = new HashSet<Term>();
        Query q = qp.parse(multiQueryParts.get(qKey));
        q.extractTerms(terms);

        int index = 1;
        explodedParts.clear();

        for (Term term : terms) {
            String fieldName = term.field();
            String searchword = term.text();
            String docType = "*";
            String fieldType = "*";
            String payload = "*";

            // Field names may encode "docType/fieldType[/payload]".
            int docAndFieldBreakPointIndex = fieldName.indexOf('/');

            if (-1 == docAndFieldBreakPointIndex) {
                docType = fieldName;
            } else {
                docType = fieldName.substring(0, docAndFieldBreakPointIndex);
                fieldType = fieldName.substring(docAndFieldBreakPointIndex + 1);

                int fieldAndPayloadBreakPointIndex = fieldType.indexOf('/');
                if (fieldAndPayloadBreakPointIndex > 0) {
                    // BUG FIX: extract the payload BEFORE truncating
                    // fieldType. The previous version truncated first and
                    // then took substring(breakPoint + 1) on the shortened
                    // string, which always threw
                    // StringIndexOutOfBoundsException.
                    payload = fieldType.substring(fieldAndPayloadBreakPointIndex + 1);
                    fieldType = fieldType.substring(0, fieldAndPayloadBreakPointIndex);
                }
            }

            if (docType.equals(defaultField))
                docType = "*";
            else if (!("*".equals(docType) || "".equals(docType))) {
                docType = sConf.getDocumentTypeCodes().getCode(docType).toString();
            }

            if (fieldType.equals(defaultField))
                fieldType = "*";
            else if (!("*".equals(fieldType) || "".equals(fieldType))) {
                fieldType = sConf.getFieldTypeCodes().getCode(fieldType).toString();
            }

            String expandedTerm = docType + "|" + fieldType + "|" + payload + "|" + Hashing.hash(searchword)
                    + "|*|*";
            explodedParts.put(index, expandedTerm);
            index++;
        }

        if (explodedParts.size() > 1) {
            multiQueryParts.remove(qKey);

            StringBuilder sb = new StringBuilder();
            boolean isFirst = true;
            for (Integer seq : explodedParts.keySet()) {
                String explodedKey = qKey + seq.toString();
                multiQueryParts.put(explodedKey, explodedParts.get(seq));
                if (isFirst)
                    isFirst = false;
                else
                    sb.append(isAllWords ? " AND " : " OR ");
                sb.append(explodedKey);
            }
            multiQuery = multiQuery.replace(qKey, " ( " + sb.toString() + " ) ");
        } else if (explodedParts.size() == 1) {
            multiQueryParts.put(qKey, explodedParts.get(index - 1));
        }
        // else: no terms extracted — leave the original part untouched
        // (the previous version stored a null value under qKey here).
    }

    return multiQuery;
}