Example usage for org.apache.lucene.index Term toString

Introduction

This page collects example usages of the org.apache.lucene.index Term toString method from open-source projects.

Prototype

@Override
public final String toString()
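
Term.toString() renders a term as its field name and text joined by a colon, in the form "field:text". The following minimal sketch illustrates this (the field and term values here are made up for illustration):

import org.apache.lucene.index.Term;

public class TermToStringDemo {
    public static void main(String[] args) {
        Term t = new Term("title", "lucene");
        // Prints "title:lucene" -- field name, colon, term text.
        System.out.println(t.toString());
    }
}

This "field:text" form is why several of the examples below can use toString() as a map or set key, or compare it against tokens split out of Query.toString().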

Usage

From source file:ai.castor.idf.IDFScorer.java

License:Apache License

public double calcIDF(String query, String answer, boolean analyze) throws ParseException {
    Analyzer analyzer;
    if (analyze) {
        analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    } else {
        analyzer = new WhitespaceAnalyzer();
    }

    QueryParser qp = new QueryParser(FIELD_BODY, analyzer);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escapedQuery = QueryParser.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().split("\\s+")));

    double idf = 0.0;
    HashSet<String> seenTerms = new HashSet<>();

    String[] terms = answer.split("\\s+");
    for (String term : terms) {
        try {
            TermQuery q = (TermQuery) qp.parse(term);
            Term t = q.getTerm();

            if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                idf += similarity.idf(reader.docFreq(t), reader.numDocs());
                seenTerms.add(t.toString());
            }
        } catch (Exception e) {
            // Skip tokens that cannot be parsed as a single TermQuery.
            continue;
        }
    }
    return idf;
}

From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java

private static String fromTermsToQuery(List<Term> terms) {
    if (terms.isEmpty()) {
        throw new IllegalArgumentException("Empty terms.");
    }
    StringBuilder query = new StringBuilder();
    for (Term t : terms) {
        query.append(t.toString()).append(" OR ");
    }
    // Drop the trailing " OR " separator.
    return query.substring(0, query.lastIndexOf(" OR "));
}

From source file:com.zimbra.cs.index.AbstractIndexStoreTest.java

License:Open Source License

private void checkNextTerm(TermFieldEnumeration fields, Term term) {
    Assert.assertTrue("fields.hasMoreElements() value when expecting:" + term.toString(),
            fields.hasMoreElements());//w  w  w. ja  v  a  2 s .c  o  m
    BrowseTerm browseTerm = fields.nextElement();
    Assert.assertNotNull("fields.nextElement() value when expecting:" + term.toString(), browseTerm);
    ZimbraLog.test.debug("Expecting %s=%s value is %s docFreq=%d", term.field(), term.text(),
            browseTerm.getText(), browseTerm.getFreq());
    Assert.assertEquals("field value", term.text(), browseTerm.getText());
}

From source file:de.ingrid.search.utils.facet.FacetClassProducer.java

License:EUPL

public List<FacetClass> produceClasses(FacetDefinition facetDef) {
    List<FacetClass> fClasses = new ArrayList<FacetClass>();
    try {
        if (facetDef.getQueryFragment() == null) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Create classes from index field '" + facetDef.getField() + "'.");
            }
            // presume we have a single field definition
            TermInfo[] tis = getHighFreqTerms(MAX_NUM, facetDef.getField());
            for (TermInfo ti : tis) {
                long start = 0;
                if (LOG.isInfoEnabled()) {
                    start = System.currentTimeMillis();
                }
                fClasses.add(produceClassFromQuery(ti.term.field() + ":" + ti.term.text(),
                        getLuceneQuery(ti.term.field() + ":" + ti.term.text())));
                if (LOG.isInfoEnabled()) {
                    LOG.info("Create facet class: " + fClasses.get(fClasses.size() - 1) + " in "
                            + (System.currentTimeMillis() - start) + " ms.");
                }
            }
        } else {

            // we have a query fragment
            Query query = getLuceneQuery(facetDef.getQueryFragment());
            OpenBitSet[] bitSets = FacetUtils.getBitSetsFromQuery(query, indexReaderWrapper);
            Map<Term, Integer> tiq = new HashMap<Term, Integer>();
            for (int i = 0; i < bitSets.length; i++) {
                IndexReader indexReader = indexReaderWrapper.getIndexReader()[i];
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Read terms from field '" + facetDef.getField() + "' for bitset " + i);
                }
                TermEnum termEnum = indexReader.terms(new Term(facetDef.getField(), ""));
                // iterate through all the values of this facet and look
                // at the number of hits per term
                try {
                    TermDocs termDocs = indexReader.termDocs();
                    // open termDocs only once, and use seek: this is more
                    // efficient
                    try {
                        do {
                            Term term = termEnum.term();
                            int count = 0;
                            int minFreq = 0;
                            // termEnum.term() can return null once the enumeration is
                            // exhausted, so check before dereferencing the term.
                            if (term != null && term.field().equals(facetDef.getField())) {
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Term found: '" + term.toString() + "' [term-field: " + term.field()
                                            + ", facet-field:" + facetDef.getField() + "].");
                                }
                                termDocs.seek(term);
                                while (termDocs.next()) {
                                    if (bitSets[i].get(termDocs.doc())) {
                                        count++;
                                    }
                                }
                                if (LOG.isDebugEnabled()) {
                                    LOG.debug("Occurence found:" + count);
                                }
                                if (count > 0) {
                                    if (!"".equals(term.text())) {
                                        if (count > minFreq) {
                                            tiq.put(term, count);
                                            // If tiq is overfull, find and remove the
                                            // term with the minimal frequency to stay
                                            // within MAX_NUM.
                                            if (tiq.size() > MAX_NUM) {
                                                Term minTerm = null;
                                                for (Term t : tiq.keySet()) {
                                                    if (minTerm == null) {
                                                        minFreq = tiq.get(t);
                                                        minTerm = t;
                                                    }
                                                    if (minFreq > tiq.get(t)) {
                                                        minFreq = tiq.get(t);
                                                        minTerm = t;
                                                    }
                                                }
                                                tiq.remove(minTerm);
                                            }
                                        }
                                    }
                                }

                            } else {
                                break;
                            }
                        } while (termEnum.next());
                    } finally {
                        termDocs.close();
                    }
                } finally {
                    termEnum.close();
                }

                TermInfo[] res = new TermInfo[tiq.size()];
                int cnt = 0;
                for (Term t : tiq.keySet()) {
                    res[cnt] = new TermInfo(t, tiq.get(t));
                    cnt++;
                }
                Arrays.sort(res, new TermInfoComparator());

                for (TermInfo ti : res) {
                    long start = 0;
                    if (LOG.isInfoEnabled()) {
                        start = System.currentTimeMillis();
                    }
                    fClasses.add(produceClassFromQuery(ti.term.field() + ":" + ti.term.text(), getLuceneQuery(
                            facetDef.getQueryFragment() + " " + ti.term.field() + ":" + ti.term.text())));
                    if (LOG.isInfoEnabled()) {
                        LOG.info("Create facet class: " + fClasses.get(fClasses.size() - 1) + " in "
                                + (System.currentTimeMillis() - start) + " ms.");
                    }
                }

            }

        }

    } catch (Exception e) {
        LOG.error("Error producing facet classes from facet '" + facetDef.getName() + "'.", e);
    }
    return fClasses;
}

From source file:io.anserini.qa.passage.IdfPassageScorer.java

License:Apache License

@Override
public void score(String query, Map<String, Float> sentences) throws Exception {
    //    EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords));
    EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET);
    QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea);
    ClassicSimilarity similarity = new ClassicSimilarity();

    String escapedQuery = QueryParser.escape(query);
    Query question = qp.parse(escapedQuery);
    HashSet<String> questionTerms = new HashSet<>(
            Arrays.asList(question.toString().trim().toLowerCase().split("\\s+")));

    // add the question terms to the termIDF Map
    for (String questionTerm : questionTerms) {
        try {
            TermQuery q = (TermQuery) qp.parse(questionTerm);
            Term t = q.getTerm();

            double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
            termIdfMap.put(questionTerm, String.valueOf(termIDF));
        } catch (Exception e) {
            continue;
        }
    }

    // avoid duplicate passages
    HashSet<String> seenSentences = new HashSet<>();

    for (Map.Entry<String, Float> sent : sentences.entrySet()) {
        double idf = 0.0;
        HashSet<String> seenTerms = new HashSet<>();

        String[] terms = sent.getKey().toLowerCase().split("\\s+");
        for (String term : terms) {
            try {
                TermQuery q = (TermQuery) qp.parse(term);
                Term t = q.getTerm();
                double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs());
                termIdfMap.put(term, String.valueOf(termIDF));

                if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) {
                    idf += termIDF;
                    seenTerms.add(t.toString());
                }
            } catch (Exception e) {
                // Skip tokens that cannot be parsed as a single TermQuery.
                continue;
            }
        }

        double weightedScore = idf + 0.0001 * sent.getValue();
        ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue());
        if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore())
                && !seenSentences.contains(sent.getKey())) {
            if (scoredPassageHeap.size() == topPassages) {
                scoredPassageHeap.pollLast();
            }
            scoredPassageHeap.add(scoredPassage);
            seenSentences.add(sent.getKey());
        }
    }
}

From source file:newseman.TestSemanticTaggerIndexing.java

License:Apache License

public void testSemanticTokenFilter() throws IOException, ParseException {
    String text = "velk svtov revoluce byla velk jnov revoluce "
            + "s velkou extra jnovou revoluc";

    Directory ramdir = new RAMDirectory();
    Analyzer analyzer = new TestSemanticAnalyzer();
    IndexWriter writer = new IndexWriter(ramdir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    Field field1 = newField("foo", text, TextField.TYPE_STORED);
    Field field2 = newField("foox", text, TextField.TYPE_STORED);

    doc.add(field1);
    doc.add(field2);
    writer.addDocument(doc);
    writer.close();

    IndexSearcher ram = new IndexSearcher(DirectoryReader.open(ramdir));
    QueryParser qp1 = new QueryParser(TEST_VERSION_CURRENT, "foo", analyzer);
    QueryParser qp2 = new QueryParser(TEST_VERSION_CURRENT, "foox", analyzer);

    TopDocs hits;

    hits = ram.search(qp1.parse("foo:XXX"), 10);
    assertTrue(hits.totalHits == 1);

    hits = ram.search(qp1.parse("foox:XXX"), 10);
    assertTrue(hits.totalHits == 0);

    // currently, each token is tokenized by the qparser
    // so we don't see them together
    Query q1 = qp1.parse("\"velká říjnová revoluce\"");
    Query q2 = qp2.parse("\"velká říjnová revoluce\"");

    assertTrue(!q1.equals(q2));

    assertTrue(q1 instanceof MultiPhraseQuery);
    assertTrue(q2 instanceof PhraseQuery);

    MultiPhraseQuery mq = (MultiPhraseQuery) q1;
    List<Term[]> ta = mq.getTermArrays();
    StringBuilder o = new StringBuilder();
    for (int i = 0; i < ta.size(); i++) {
        for (Term t : ta.get(i)) {
            o.append(t.toString());
            o.append(" ");
        }
        o.append("|");
    }
    assertTrue(o.toString()
            .equals("foo:velk foo:velk jnov revoluce foo:XXX |foo:jnov |foo:revoluce |"));

    assertTrue(q1.toString().equals("foo:\"(velká velká říjnová revoluce XXX) říjnová revoluce\""));
    assertTrue(q2.toString().equals("foox:\"velká říjnová revoluce\""));

    Set<Term> terms = new HashSet<Term>();
    q1.extractTerms(terms);

    // extract only the 2nd (semantic) element
    q1 = qp1.parse("revoluce");

    terms = new HashSet<Term>();
    q1.extractTerms(terms);

    Term semQ = (Term) terms.toArray()[1];
    String sem = semQ.text();

    hits = ram.search(qp1.parse(sem), 10);
    assertTrue(hits.totalHits == 1);
    hits = ram.search(qp1.parse(semQ.toString()), 10);
    assertTrue(hits.totalHits == 1);
    hits = ram.search(qp2.parse(sem), 10);
    assertTrue(hits.totalHits == 0);

    ramdir.close();
}

From source file:org.apache.blur.manager.AliasBlurFilterCache.java

License:Apache License

private Filter buildNewFilter(Query query, ConcurrentMap<String, String> filterAlias, FilterParser filterParser)
        throws ParseException {
    if (query instanceof BooleanQuery) {
        BooleanQuery booleanQuery = (BooleanQuery) query;
        BooleanFilter booleanFilter = new BooleanFilter();
        for (BooleanClause clause : booleanQuery.clauses()) {
            booleanFilter.add(buildNewFilter(clause.getQuery(), filterAlias, filterParser), clause.getOccur());
        }
        return booleanFilter;
    } else if (query instanceof TermQuery) {
        TermQuery termQuery = (TermQuery) query;
        Term term = termQuery.getTerm();
        String key = term.toString();
        String queryStr = filterAlias.get(key);
        if (queryStr == null) {
            return new QueryWrapperFilter(termQuery);
        }
        String id = getId(key);
        return new FilterCache(id, new QueryWrapperFilter(filterParser.parse(queryStr)));
    } else {
        return new QueryWrapperFilter(query);
    }
}

From source file:org.apache.jetspeed.services.search.lucene.LuceneSearchService.java

License:Apache License

/**
 * @see org.apache.jetspeed.services.search.SearchService#remove(java.lang.Collection)
 * @param c collection of objects to remove from the search index
 * @return true if the last removal deleted at least one document
 */
public boolean remove(Collection c) {
    boolean result = false;

    try {
        IndexReader indexReader = IndexReader.open(this.rootDir);

        Iterator it = c.iterator();
        while (it.hasNext()) {
            Object o = it.next();
            // Look up appropriate handler
            ObjectHandler handler = HandlerFactory.getHandler(o);

            // Parse the object
            ParsedObject parsedObject = handler.parseObject(o);

            // Create term
            Term term = null;

            if (parsedObject.getKey() != null) {
                term = new Term(ParsedObject.FIELDNAME_KEY, parsedObject.getKey());
                // Remove the document from search index
                int rc = indexReader.delete(term);
                logger.info(
                        "Attempted to delete '" + term.toString() + "' from index, documents deleted = " + rc);
                result = rc > 0;
            }
        }

        indexReader.close();

        IndexWriter indexWriter = new IndexWriter(rootDir, new StandardAnalyzer(), false);
        indexWriter.optimize();
        indexWriter.close();

    } catch (Exception e) {
        logger.error("Exception", e);
        result = false;
    }

    return result;
}

From source file:org.apache.solr.search.stats.ExactStatsCache.java

License:Apache License

@Override
public void returnLocalStats(ResponseBuilder rb, SolrIndexSearcher searcher) {
    Query q = rb.getQuery();
    try {
        q = q.rewrite(searcher.getIndexReader());
        HashSet<Term> terms = new HashSet<Term>();
        q.extractTerms(terms);
        IndexReaderContext context = searcher.getTopReaderContext();
        HashMap<String, TermStats> statsMap = new HashMap<String, TermStats>();
        HashMap<String, CollectionStats> colMap = new HashMap<String, CollectionStats>();
        for (Term t : terms) {
            TermContext termContext = TermContext.build(context, t);

            TermStatistics tst = searcher.localTermStatistics(t, termContext);
            if (tst.docFreq() == 0) { // skip terms that are not present here
                continue;
            }

            statsMap.put(t.toString(), new TermStats(t.field(), tst));
            rb.rsp.add(TERMS_KEY, t.toString());
            if (!colMap.containsKey(t.field())) { // collection stats for this field
                colMap.put(t.field(), new CollectionStats(searcher.localCollectionStatistics(t.field())));
            }
        }

        String termStatsString = StatsUtil.termStatsMapToString(statsMap);
        rb.rsp.add(TERM_STATS_KEY, termStatsString);
        String colStatsString = StatsUtil.colStatsMapToString(colMap);
        rb.rsp.add(COL_STATS_KEY, colStatsString);
        if (LOG.isDebugEnabled()) {
            LOG.debug("termStats=" + termStatsString + ", collectionStats=" + colStatsString + ", terms="
                    + terms + ", numDocs=" + searcher.maxDoc());
        }
    } catch (IOException e) {
        LOG.error("Error collecting local stats, query='" + q.toString() + "'", e);
        throw new SolrException(ErrorCode.SERVER_ERROR, "Error collecting local stats.", e);
    }
}

From source file:org.apache.solr.search.stats.StatsUtil.java

License:Apache License

private static TermStats termStatsFromString(String data, Term t) {
    if (data == null || data.trim().length() == 0) {
        LOG.warn("Invalid empty term stats string");
        return null;
    }
    String[] vals = data.split(",");
    if (vals.length < 2) {
        LOG.warn("Invalid term stats string, num fields " + vals.length + " < 2, '" + data + "'");
        return null;
    }
    Term termToUse;
    int idx = 0;
    if (vals.length == 3) {
        idx++;
        // with term
        Term term = termFromString(vals[0]);
        if (term != null) {
            termToUse = term;
            if (t != null) {
                assert term.equals(t);
            }
        } else { // failed term decoding
            termToUse = t;
        }
    } else {
        termToUse = t;
    }
    if (termToUse == null) {
        LOG.warn("Missing term in termStats '" + data + "'");
        return null;
    }
    try {
        long docFreq = Long.parseLong(vals[idx++]);
        long totalTermFreq = Long.parseLong(vals[idx]);
        return new TermStats(termToUse.toString(), docFreq, totalTermFreq);
    } catch (Exception e) {
        LOG.warn("Invalid termStats string '" + data + "'");
        return null;
    }
}