List of usage examples for org.apache.lucene.index Term toString
@Override
public final String toString()
From source file:ai.castor.idf.IDFScorer.java
License:Apache License
public double calcIDF(String query, String answer, boolean analyze) throws ParseException { Analyzer analyzer;// ww w.jav a 2 s . c o m if (analyze) { analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords)); } else { analyzer = new WhitespaceAnalyzer(); } QueryParser qp = new QueryParser(FIELD_BODY, analyzer); ClassicSimilarity similarity = new ClassicSimilarity(); String escapedQuery = qp.escape(query); Query question = qp.parse(escapedQuery); HashSet<String> questionTerms = new HashSet<>(Arrays.asList(question.toString().trim().split("\\s+"))); double idf = 0.0; HashSet<String> seenTerms = new HashSet<>(); String[] terms = answer.split("\\s+"); for (String term : terms) { try { TermQuery q = (TermQuery) qp.parse(term); Term t = q.getTerm(); if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) { idf += similarity.idf(reader.docFreq(t), reader.numDocs()); seenTerms.add(t.toString()); } else { idf += 0.0; } } catch (Exception e) { continue; } } return idf; }
From source file:br.ufrgs.inf.dsmoura.repository.controller.solr.SolrConversionUtil.java
private static String fromTermsToQuery(List<Term> terms) { if (terms.size() == 0) { throw new IllegalArgumentException("Empty terms."); }//from ww w . ja v a2 s. com String query = ""; for (Term t : terms) { query += t.toString() + " OR "; } return query.substring(0, query.lastIndexOf(" OR ")); }
From source file:com.zimbra.cs.index.AbstractIndexStoreTest.java
License:Open Source License
private void checkNextTerm(TermFieldEnumeration fields, Term term) { Assert.assertTrue("fields.hasMoreElements() value when expecting:" + term.toString(), fields.hasMoreElements());//w w w. ja v a 2 s .c o m BrowseTerm browseTerm = fields.nextElement(); Assert.assertNotNull("fields.nextElement() value when expecting:" + term.toString(), browseTerm); ZimbraLog.test.debug("Expecting %s=%s value is %s docFreq=%d", term.field(), term.text(), browseTerm.getText(), browseTerm.getFreq()); Assert.assertEquals("field value", term.text(), browseTerm.getText()); }
From source file:de.ingrid.search.utils.facet.FacetClassProducer.java
License:EUPL
public List<FacetClass> produceClasses(FacetDefinition facetDef) { List<FacetClass> fClasses = new ArrayList<FacetClass>(); try {/*from ww w . j av a 2 s . c o m*/ if (facetDef.getQueryFragment() == null) { if (LOG.isDebugEnabled()) { LOG.debug("Create classes from index field '" + facetDef.getField() + "'."); } // presume we have a single field definition TermInfo[] tis = getHighFreqTerms(MAX_NUM, facetDef.getField()); for (TermInfo ti : tis) { long start = 0; if (LOG.isInfoEnabled()) { start = System.currentTimeMillis(); } fClasses.add(produceClassFromQuery(ti.term.field() + ":" + ti.term.text(), getLuceneQuery(ti.term.field() + ":" + ti.term.text()))); if (LOG.isInfoEnabled()) { LOG.info("Create facet class: " + fClasses.get(fClasses.size() - 1) + " in " + (System.currentTimeMillis() - start) + " ms."); } } } else { // we have a query fragment Query query = getLuceneQuery(facetDef.getQueryFragment()); OpenBitSet[] bitSets = FacetUtils.getBitSetsFromQuery(query, indexReaderWrapper); Map<Term, Integer> tiq = new HashMap<Term, Integer>(); for (int i = 0; i < bitSets.length; i++) { IndexReader indexReader = indexReaderWrapper.getIndexReader()[i]; if (LOG.isDebugEnabled()) { LOG.debug("Read terms from field '" + facetDef.getField() + "' for bitset " + i); } TermEnum termEnum = indexReader.terms(new Term(facetDef.getField(), "")); // iterate through all the values of this facet and see look // at number of hits per term try { TermDocs termDocs = indexReader.termDocs(); // open termDocs only once, and use seek: this is more // efficient try { do { Term term = termEnum.term(); if (LOG.isDebugEnabled()) { LOG.debug("Term found: '" + term.toString() + "' [term-field: " + term.field() + ", facet-field:" + facetDef.getField() + "]."); } int count = 0; int minFreq = 0; if (term != null && term.field().equals(facetDef.getField())) { termDocs.seek(term); while (termDocs.next()) { if (bitSets[i].get(termDocs.doc())) { count++; } } if (LOG.isDebugEnabled()) { 
LOG.debug("Occurence found:" + count); } if (count > 0) { if (!"".equals(term.text())) { if (count > minFreq) { tiq.put(term, count); if (tiq.size() > MAX_NUM) // if // tiq // overfull { // find and remove minimal // term to ensure capacity // of Term minTerm = null; for (Term t : tiq.keySet()) { if (minTerm == null) { minFreq = tiq.get(t); minTerm = t; } if (minFreq > tiq.get(t)) { minFreq = tiq.get(t); minTerm = t; } } tiq.remove(minTerm); } } } } } else { break; } } while (termEnum.next()); } finally { termDocs.close(); } } finally { termEnum.close(); } TermInfo[] res = new TermInfo[tiq.size()]; int cnt = 0; for (Term t : tiq.keySet()) { res[cnt] = new TermInfo(t, tiq.get(t)); cnt++; } Arrays.sort(res, new TermInfoComparator()); for (TermInfo ti : res) { long start = 0; if (LOG.isInfoEnabled()) { start = System.currentTimeMillis(); } fClasses.add(produceClassFromQuery(ti.term.field() + ":" + ti.term.text(), getLuceneQuery( facetDef.getQueryFragment() + " " + ti.term.field() + ":" + ti.term.text()))); if (LOG.isInfoEnabled()) { LOG.info("Create facet class: " + fClasses.get(fClasses.size() - 1) + " in " + (System.currentTimeMillis() - start) + " ms."); } } } } } catch (ParseException e) { LOG.error("Error producing facet classes from facet '" + facetDef.getName() + "'.", e); } catch (Exception e) { LOG.error("Error producing facet classes from facet '" + facetDef.getName() + "'.", e); } return fClasses; }
From source file:io.anserini.qa.passage.IdfPassageScorer.java
License:Apache License
@Override public void score(String query, Map<String, Float> sentences) throws Exception { // EnglishAnalyzer ea = new EnglishAnalyzer(StopFilter.makeStopSet(stopWords)); EnglishAnalyzer ea = new EnglishAnalyzer(CharArraySet.EMPTY_SET); QueryParser qp = new QueryParser(LuceneDocumentGenerator.FIELD_BODY, ea); ClassicSimilarity similarity = new ClassicSimilarity(); String escapedQuery = qp.escape(query); Query question = qp.parse(escapedQuery); HashSet<String> questionTerms = new HashSet<>( Arrays.asList(question.toString().trim().toLowerCase().split("\\s+"))); // add the question terms to the termIDF Map for (String questionTerm : questionTerms) { try {/*from w w w . j av a2s .c o m*/ TermQuery q = (TermQuery) qp.parse(questionTerm); Term t = q.getTerm(); double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs()); termIdfMap.put(questionTerm, String.valueOf(termIDF)); } catch (Exception e) { continue; } } // avoid duplicate passages HashSet<String> seenSentences = new HashSet<>(); for (Map.Entry<String, Float> sent : sentences.entrySet()) { double idf = 0.0; HashSet<String> seenTerms = new HashSet<>(); String[] terms = sent.getKey().toLowerCase().split("\\s+"); for (String term : terms) { try { TermQuery q = (TermQuery) qp.parse(term); Term t = q.getTerm(); double termIDF = similarity.idf(reader.docFreq(t), reader.numDocs()); termIdfMap.put(term, String.valueOf(termIDF)); if (questionTerms.contains(t.toString()) && !seenTerms.contains(t.toString())) { idf += termIDF; seenTerms.add(t.toString()); } else { idf += 0.0; } } catch (Exception e) { continue; } } double weightedScore = idf + 0.0001 * sent.getValue(); ScoredPassage scoredPassage = new ScoredPassage(sent.getKey(), weightedScore, sent.getValue()); if ((scoredPassageHeap.size() < topPassages || weightedScore > scoredPassageHeap.peekLast().getScore()) && !seenSentences.contains(sent)) { if (scoredPassageHeap.size() == topPassages) { scoredPassageHeap.pollLast(); } scoredPassageHeap.add(scoredPassage); 
seenSentences.add(sent.getKey()); } } }
From source file:newseman.TestSemanticTaggerIndexing.java
License:Apache License
/**
 * Indexes one document in two fields — "foo" analyzed by the semantic analyzer under
 * test, "foox" by a plain analyzer — then verifies that semantic (synonym-like) tokens
 * are searchable in "foo" but not "foox", and that phrase parsing over the semantic
 * field yields a MultiPhraseQuery with the expected term layout.
 */
public void testSemanticTokenFilter() throws IOException, ParseException {
    String text = "velk svtov revoluce byla velk jnov revoluce " + "s velkou extra jnovou revoluc";
    Directory ramdir = new RAMDirectory();
    Analyzer analyzer = new TestSemanticAnalyzer();
    IndexWriter writer = new IndexWriter(ramdir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    // Same text in both fields; only "foo" goes through the semantic token filter.
    Field field1 = newField("foo", text, TextField.TYPE_STORED);
    Field field2 = newField("foox", text, TextField.TYPE_STORED);
    doc.add(field1);
    doc.add(field2);
    writer.addDocument(doc);
    writer.close();
    IndexSearcher ram = new IndexSearcher(DirectoryReader.open(ramdir));
    QueryParser qp1 = new QueryParser(TEST_VERSION_CURRENT, "foo", analyzer);
    QueryParser qp2 = new QueryParser(TEST_VERSION_CURRENT, "foox", analyzer);
    TopDocs hits;
    // The injected semantic token "XXX" is findable in the semantic field only.
    hits = ram.search(qp1.parse("foo:XXX"), 10);
    assertTrue(hits.totalHits == 1);
    hits = ram.search(qp1.parse("foox:XXX"), 10);
    assertTrue(hits.totalHits == 0);
    // Currently each token is tokenized by the query parser separately,
    // so we don't see them together.
    Query q1 = qp1.parse("\"velk jnov revoluce\"");
    Query q2 = qp2.parse("\"velk jnov revoluce\"");
    assertTrue(!q1.equals(q2));
    // Stacked semantic tokens turn the phrase into a MultiPhraseQuery on "foo".
    assertTrue(q1 instanceof MultiPhraseQuery);
    assertTrue(q2 instanceof PhraseQuery);
    MultiPhraseQuery mq = (MultiPhraseQuery) q1;
    List<Term[]> ta = mq.getTermArrays();
    // Render each phrase position's term alternatives as "terms... |" for comparison.
    StringBuffer o = new StringBuffer();
    for (int i = 0; i < ta.size(); i++) {
        for (Term t : ta.get(i)) {
            o.append(t.toString());
            o.append(" ");
        }
        o.append("|");
    }
    assertTrue(o.toString()
            .equals("foo:velk foo:velk jnov revoluce foo:XXX |foo:jnov |foo:revoluce |"));
    assertTrue(q1.toString().equals("foo:\"(velk velk jnov revoluce XXX) jnov revoluce\""));
    assertTrue(q2.toString().equals("foox:\"velk jnov revoluce\""));
    Set<Term> terms = new HashSet<Term>();
    q1.extractTerms(terms);
    // Extract only the 2nd (semantic) element of a single-token parse and verify it is
    // searchable both as raw text and as its full "field:text" rendering.
    q1 = qp1.parse("revoluce");
    terms = new HashSet<Term>();
    q1.extractTerms(terms);
    Term semQ = (Term) terms.toArray()[1];
    String sem = semQ.text();
    hits = ram.search(qp1.parse(sem), 10);
    assertTrue(hits.totalHits == 1);
    hits = ram.search(qp1.parse(semQ.toString()), 10);
    assertTrue(hits.totalHits == 1);
    hits = ram.search(qp2.parse(sem), 10);
    assertTrue(hits.totalHits == 0);
    // NOTE(review): ramdir is not closed in a finally block, so a failing assertion
    // above leaks the directory — consider try/finally.
    ramdir.close();
}
From source file:org.apache.blur.manager.AliasBlurFilterCache.java
License:Apache License
/**
 * Recursively converts a query into a filter, substituting alias definitions.
 *
 * BooleanQuery clauses are rebuilt as a BooleanFilter; a TermQuery whose rendered term
 * matches a registered alias is replaced by a cached filter parsed from the alias
 * definition; everything else is wrapped as-is.
 *
 * @param query        the query to convert
 * @param filterAlias  alias name ("field:text") to filter-query-string mapping
 * @param filterParser parser used to materialize alias definitions
 * @return the equivalent filter
 * @throws ParseException if an alias definition fails to parse
 */
private Filter buildNewFilter(Query query, ConcurrentMap<String, String> filterAlias, FilterParser filterParser)
        throws ParseException {
    if (query instanceof BooleanQuery) {
        BooleanFilter combined = new BooleanFilter();
        for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
            Filter sub = buildNewFilter(clause.getQuery(), filterAlias, filterParser);
            combined.add(sub, clause.getOccur());
        }
        return combined;
    }
    if (!(query instanceof TermQuery)) {
        return new QueryWrapperFilter(query);
    }
    TermQuery termQuery = (TermQuery) query;
    String aliasKey = termQuery.getTerm().toString();
    String aliasQuery = filterAlias.get(aliasKey);
    if (aliasQuery == null) {
        // No alias registered for this term: wrap the term query directly.
        return new QueryWrapperFilter(termQuery);
    }
    Filter resolved = new QueryWrapperFilter(filterParser.parse(aliasQuery));
    return new FilterCache(getId(aliasKey), resolved);
}
From source file:org.apache.jetspeed.services.search.lucene.LuceneSearchService.java
License:Apache License
/** * // w w w . j av a2s . c om * @see org.apache.jetspeed.services.search.SearchService#remove(java.lang.Collection) * @param c * @return */ public boolean remove(Collection c) { boolean result = false; try { IndexReader indexReader = IndexReader.open(this.rootDir); Iterator it = c.iterator(); while (it.hasNext()) { Object o = it.next(); // Look up appropriate handler ObjectHandler handler = HandlerFactory.getHandler(o); // Parse the object ParsedObject parsedObject = handler.parseObject(o); // Create term Term term = null; if (parsedObject.getKey() != null) { term = new Term(ParsedObject.FIELDNAME_KEY, parsedObject.getKey()); // Remove the document from search index int rc = indexReader.delete(term); logger.info( "Attempted to delete '" + term.toString() + "' from index, documents deleted = " + rc); //System.out.println("Attempted to delete '" + term.toString() + "' from index, documents deleted = " + rc); result = rc > 0; } } indexReader.close(); IndexWriter indexWriter = new IndexWriter(rootDir, new StandardAnalyzer(), false); indexWriter.optimize(); indexWriter.close(); } catch (Exception e) { logger.error("Exception", e); result = false; } return result; }
From source file:org.apache.solr.search.stats.ExactStatsCache.java
License:Apache License
@Override public void returnLocalStats(ResponseBuilder rb, SolrIndexSearcher searcher) { Query q = rb.getQuery();/*ww w.ja va 2 s . c o m*/ try { q = q.rewrite(searcher.getIndexReader()); HashSet<Term> terms = new HashSet<Term>(); q.extractTerms(terms); IndexReaderContext context = searcher.getTopReaderContext(); HashMap<String, TermStats> statsMap = new HashMap<String, TermStats>(); HashMap<String, CollectionStats> colMap = new HashMap<String, CollectionStats>(); for (Term t : terms) { TermContext termContext = TermContext.build(context, t); TermStatistics tst = searcher.localTermStatistics(t, termContext); if (tst.docFreq() == 0) { // skip terms that are not present here continue; } statsMap.put(t.toString(), new TermStats(t.field(), tst)); rb.rsp.add(TERMS_KEY, t.toString()); if (!colMap.containsKey(t.field())) { // collection stats for this field colMap.put(t.field(), new CollectionStats(searcher.localCollectionStatistics(t.field()))); } } String termStatsString = StatsUtil.termStatsMapToString(statsMap); rb.rsp.add(TERM_STATS_KEY, termStatsString); String colStatsString = StatsUtil.colStatsMapToString(colMap); rb.rsp.add(COL_STATS_KEY, colStatsString); if (LOG.isDebugEnabled()) { LOG.debug("termStats=" + termStatsString + ", collectionStats=" + colStatsString + ", terms=" + terms + ", numDocs=" + searcher.maxDoc()); } } catch (IOException e) { LOG.error("Error collecting local stats, query='" + q.toString() + "'", e); throw new SolrException(ErrorCode.SERVER_ERROR, "Error collecting local stats.", e); } }
From source file:org.apache.solr.search.stats.StatsUtil.java
License:Apache License
private static TermStats termStatsFromString(String data, Term t) { if (data == null || data.trim().length() == 0) { LOG.warn("Invalid empty term stats string"); return null; }/*from ww w .j a v a 2s . c o m*/ String[] vals = data.split(","); if (vals.length < 2) { LOG.warn("Invalid term stats string, num fields " + vals.length + " < 2, '" + data + "'"); return null; } Term termToUse; int idx = 0; if (vals.length == 3) { idx++; // with term Term term = termFromString(vals[0]); if (term != null) { termToUse = term; if (t != null) { assert term.equals(t); } } else { // failed term decoding termToUse = t; } } else { termToUse = t; } if (termToUse == null) { LOG.warn("Missing term in termStats '" + data + "'"); return null; } try { long docFreq = Long.parseLong(vals[idx++]); long totalTermFreq = Long.parseLong(vals[idx]); return new TermStats(termToUse.toString(), docFreq, totalTermFreq); } catch (Exception e) { LOG.warn("Invalid termStats string '" + data + "'"); return null; } }