List of usage examples for org.apache.lucene.util.BytesRef.utf8ToString()
public String utf8ToString()
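The method interprets the bytes held by a BytesRef as UTF-8 and returns the decoded Java String. Before the project examples below, here is a minimal standalone sketch of that round trip (not taken from any of the projects; the sample text is arbitrary):

import org.apache.lucene.util.BytesRef;

public class BytesRefUtf8ToStringExample {
    public static void main(String[] args) {
        // The BytesRef(CharSequence) constructor encodes the text as UTF-8 bytes.
        BytesRef ref = new BytesRef("café");

        // utf8ToString() decodes the stored bytes back into a Java String.
        String decoded = ref.utf8ToString();

        System.out.println(decoded);    // café
        System.out.println(ref.length); // 5 -- number of UTF-8 bytes ('é' takes two bytes)
    }
}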
From source file:org.opengrok.suggest.SuggesterSearcher.java
License:Open Source License
private List<LookupResultItem> suggest(final Query query, final LeafReaderContext leafReaderContext,
        final String project, final SuggesterQuery suggesterQuery, final PopularityCounter searchCounts)
        throws IOException {
    if (Thread.currentThread().isInterrupted()) {
        interrupted = true;
        return Collections.emptyList();
    }

    boolean shouldLeaveOutSameTerms = shouldLeaveOutSameTerms(query, suggesterQuery);
    Set<BytesRef> tokensAlreadyIncluded = null;
    if (shouldLeaveOutSameTerms) {
        tokensAlreadyIncluded = SuggesterUtils.intoTermsExceptPhraseQuery(query).stream()
                .filter(t -> t.field().equals(suggesterQuery.getField())).map(Term::bytes)
                .collect(Collectors.toSet());
    }

    boolean needsDocumentIds = query != null && !(query instanceof MatchAllDocsQuery);

    ComplexQueryData complexQueryData = null;
    if (needsDocumentIds) {
        complexQueryData = getComplexQueryData(query, leafReaderContext);
        if (interrupted) {
            return Collections.emptyList();
        }
    }

    Terms terms = leafReaderContext.reader().terms(suggesterQuery.getField());
    TermsEnum termsEnum = suggesterQuery.getTermsEnumForSuggestions(terms);

    LookupPriorityQueue queue = new LookupPriorityQueue(resultSize);

    boolean needPositionsAndFrequencies = needPositionsAndFrequencies(query);

    PostingsEnum postingsEnum = null;

    BytesRef term = termsEnum.next();
    while (term != null) {
        if (Thread.currentThread().isInterrupted()) {
            interrupted = true;
            break;
        }

        if (needPositionsAndFrequencies) {
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.POSITIONS | PostingsEnum.FREQS);
        } else {
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        }

        int score;
        if (!needsDocumentIds) {
            score = normalizeDocumentFrequency(termsEnum.docFreq(), numDocs);
        } else if (needPositionsAndFrequencies) {
            score = getPhraseScore(complexQueryData, leafReaderContext.docBase, postingsEnum);
        } else {
            score = getDocumentFrequency(complexQueryData.documentIds, leafReaderContext.docBase, postingsEnum);
        }

        if (score > 0) {
            if (!shouldLeaveOutSameTerms || !tokensAlreadyIncluded.contains(term)) {
                score += searchCounts.get(term) * TERM_ALREADY_SEARCHED_MULTIPLIER;
                if (queue.canInsert(score)) {
                    queue.insertWithOverflow(new LookupResultItem(term.utf8ToString(), project, score));
                }
            }
        }

        term = termsEnum.next();
    }

    return queue.getResult();
}
From source file:org.opensextant.solrtexttagger.Tagger.java
License:Open Source License
public void process() throws IOException {
    if (terms == null)
        return;

    //a shared pointer to the head used by this method and each Tag instance.
    final TagLL[] head = new TagLL[1];

    TermPrefixCursor cursor = null; //re-used
    TermsEnum termsEnum = null; //re-used

    //boolean switch used to log warnings in case tokens were skipped during tagging.
    boolean skippedTokens = false;

    while (tokenStream.incrementToken()) {
        if (log.isTraceEnabled()) {
            log.trace("Token: {}, posInc: {}, offset: [{},{}]", byteRefAtt, posIncAtt.getPositionIncrement(),
                    offsetAtt.startOffset(), offsetAtt.endOffset());
        }
        //check for posInc < 1 (alternate Tokens, such as expanded Synonyms)
        if (posIncAtt.getPositionIncrement() < 1) {
            //(a) Deal with this as a configuration issue and throw an exception
            if (!skipAltTokens) {
                //TODO throw UnsupportedTokenException when PhraseBuilder is ported
                throw new IllegalStateException("Query Analyzer generates alternate "
                        + "Tokens (posInc == 0). Please adapt your Analyzer configuration or "
                        + "enable '" + TaggerRequestHandler.SKIP_ALT_TOKENS + "' to skip such "
                        + "tokens. NOTE: enabling '" + TaggerRequestHandler.SKIP_ALT_TOKENS
                        + "' might result in wrong tagging results if the index time analyzer "
                        + "is not configured accordingly. For detailed information see "
                        + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
            } else {
                //(b) In case the index time analyser had indexed all variants (users
                // need to ensure that) processing of alternate tokens can be skipped
                // as anyways all alternatives will be contained in the FST.
                skippedTokens = true;
                log.trace(" ... ignored token");
                continue;
            }
        }
        //-- If PositionIncrement > 1 (stopwords)
        if (!ignoreStopWords && posIncAtt.getPositionIncrement() > 1) {
            log.trace(" - posInc > 1 ... mark cluster as done");
            advanceTagsAndProcessClusterIfDone(head, null);
        }

        final BytesRef term;
        //NOTE: we need to lookup tokens if
        // * the LookupAtt is true OR
        // * there are still advancing tags (to find the longest possible match)
        if (lookupAtt.isTaggable() || head[0] != null) {
            //-- Lookup the term id from the next token
            term = byteRefAtt.getBytesRef();
            if (term.length == 0) {
                throw new IllegalArgumentException(
                        "term: " + term.utf8ToString() + " analyzed to a zero-length token");
            }
        } else { //no current cluster AND lookup == false ...
            term = null; //skip this token
        }

        //-- Process tag
        advanceTagsAndProcessClusterIfDone(head, term);

        //-- only create new Tags for Tokens we need to lookup
        if (lookupAtt.isTaggable() && term != null) {
            //determine if the terms index has a term starting with the provided term
            // TODO cache hashcodes of valid first terms (directly from char[]?) to skip lookups?
            termsEnum = terms.iterator();
            if (cursor == null) //re-usable
                cursor = new TermPrefixCursor(termsEnum, liveDocs, docIdsCache);
            if (cursor.advance(term)) {
                TagLL newTail = new TagLL(head, cursor, offsetAtt.startOffset(), offsetAtt.endOffset(), null);
                termsEnum = null; //because the cursor now "owns" this instance
                cursor = null; //because the new tag now "owns" this instance

                //and add it to the end
                if (head[0] == null) {
                    head[0] = newTail;
                } else {
                    for (TagLL t = head[0]; true; t = t.nextTag) {
                        if (t.nextTag == null) {
                            t.addAfterLL(newTail);
                            break;
                        }
                    }
                }
            }
        } //if termId >= 0
    } //end while(incrementToken())

    //-- Finish all tags
    advanceTagsAndProcessClusterIfDone(head, null);
    assert head[0] == null;

    if (!loggedSkippedAltTokenWarning && skippedTokens) {
        loggedSkippedAltTokenWarning = true; //only log once
        log.warn("The Tagger skipped some alternate tokens (tokens with posInc == 0) "
                + "while processing text. This may cause problems with some Analyzer "
                + "configurations (e.g. query time synonym expansion). For details see "
                + "https://github.com/OpenSextant/SolrTextTagger/pull/11#issuecomment-24936225");
    }

    tokenStream.end();
    //tokenStream.close(); caller closes because caller acquired it
}
From source file:org.pageseeder.flint.lucene.search.Terms.java
License:Apache License
/**
 * Loads all the fuzzy terms into the list of terms given the reader.
 *
 * @param reader Index reader to use.
 * @param values The list of terms to load.
 * @param term   The term to use.
 *
 * @throws IOException If an error is thrown by the fuzzy term enumeration.
 */
public static void fuzzy(IndexReader reader, List<String> values, Term term, int minSimilarity)
        throws IOException {
    AttributeSource atts = new AttributeSource();
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    FuzzyTermsEnum fuzzy = new FuzzyTermsEnum(terms, atts, term, minSimilarity, 0, false);
    BytesRef val;
    BytesRef searched = term.bytes();
    while ((val = fuzzy.next()) != null) {
        if (!searched.bytesEquals(val))
            values.add(val.utf8ToString());
    }
}
From source file:org.pageseeder.flint.lucene.search.Terms.java
License:Apache License
/**
 * Loads all the prefix terms into the list of terms given the reader.
 *
 * @param reader Index reader to use.
 * @param values The list of values to load.
 * @param term   The term to use.
 *
 * @throws IOException If an error is thrown by the prefix term enumeration.
 */
public static void prefix(IndexReader reader, List<String> values, Term term) throws IOException {
    Fields fields = MultiFields.getFields(reader);
    org.apache.lucene.index.Terms terms = fields == null ? null : fields.terms(term.field());
    if (terms == null)
        return;
    TermsEnum prefixes = terms.intersect(new CompiledAutomaton(PrefixQuery.toAutomaton(term.bytes())), null);
    BytesRef val;
    while ((val = prefixes.next()) != null) {
        values.add(val.utf8ToString());
    }
}
From source file:org.pageseeder.flint.lucene.search.Terms.java
License:Apache License
/**
 * Returns the list of term values for the specified field.
 *
 * @param reader The index reader to use
 * @param field  The field
 *
 * @return the list of terms for this field
 *
 * @throws IOException should any IO error be reported.
 */
@Beta
public static List<String> values(IndexReader reader, String field) throws IOException {
    LOGGER.debug("Loading term values for field {}", field);
    List<String> values = new ArrayList<String>();
    org.apache.lucene.index.Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null)
        return values;
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum == TermsEnum.EMPTY)
        return values;
    while (termsEnum.next() != null) {
        BytesRef t = termsEnum.term();
        if (t == null)
            break;
        values.add(t.utf8ToString());
    }
    return values;
}
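A possible call site for the helper above, as a hedged sketch only: the index directory and field name are made up for illustration and are not part of the original project. Each returned value was decoded from a BytesRef via utf8ToString().

import java.nio.file.Paths;
import java.util.List;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class ListFieldTerms {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index; path and field name are illustrative.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            List<String> values = org.pageseeder.flint.lucene.search.Terms.values(reader, "title");
            values.forEach(System.out::println);
        }
    }
}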
From source file:org.solr.classtify.SimpleNaiveBayesClassifierTest.java
License:Apache License
@Test
public void classtify() throws IOException {
    SimpleNaiveBayesClassifier classifier = new SimpleNaiveBayesClassifier();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(dir)));
    AtomicReader wrap = SlowCompositeReaderWrapper.wrap(reader);
    classifier.train(wrap, textFieldName, categoryFieldName, analyzer);
    ClassificationResult<BytesRef> assignClass = classifier.assignClass(newText);
    BytesRef assignedClass = assignClass.getAssignedClass();
    double score = assignClass.getScore();
    System.out.println(assignedClass.utf8ToString() + "," + score);
}
From source file:org.splevo.vpm.analyzer.semantic.lucene.finder.SharedTermFinder.java
License:Open Source License
/**
 * Determine the terms shared by the related variation points by looking up all terms included
 * in the search query AND a found document.
 *
 * @param referenceDocTerms
 *            The terms of the reference doc and used in the search query.
 * @param foundDoc
 *            A specific document found by the query.
 * @param foundDocId
 *            The id of the document found to get its index terms.
 * @param field
 *            The field to get the terms for.
 *
 * @return The {@link Set} of terms shared between the query and the document.
 * @throws IOException
 */
private Set<String> determineSharedTerms(Set<Term> referenceDocTerms, Document foundDoc, int foundDocId,
        String field) throws IOException {
    Set<String> sharedTerms = new TreeSet<String>();

    Terms termVector = reader.getTermVector(foundDocId, field);
    TermsEnum termsEnum = null;
    TermsEnum iterator = termVector.iterator(termsEnum);
    BytesRef br = null;
    while ((br = iterator.next()) != null) {
        String term = br.utf8ToString();
        for (Term t : referenceDocTerms) {
            if (t.text().equals(term)) {
                sharedTerms.add(term);
            }
        }
    }

    return sharedTerms;
}
From source file:org.splevo.vpm.analyzer.semantic.lucene.finder.SharedTermFinder.java
License:Open Source License
/**
 * Extracts the frequencies of all {@link Term}s in the specified {@link Document}. Uses the
 * member reader.
 *
 * @param docId
 *            The ID of the {@link Document} to extract the {@link Term}s from.
 * @param fieldName
 *            The name of the field to extract frequencies from.
 * @return A {@link Map} containing the terms as the key and the related frequencies as
 *         {@link Integer} value.
 */
private Map<String, Integer> getTermFrequencies(int docId, String fieldName) {
    Map<String, Integer> frequencies = new HashMap<String, Integer>();
    try {
        Terms vector = reader.getTermVector(docId, fieldName);
        if (vector == null) {
            return frequencies;
        }
        TermsEnum termsEnum = null;
        termsEnum = vector.iterator(termsEnum);
        BytesRef text = null;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) termsEnum.totalTermFreq();
            frequencies.put(term, freq);
        }
    } catch (IOException e) {
        logger.error("Failure while extracting Term Frequencies.");
    }
    return frequencies;
}
From source file:org.splevo.vpm.analyzer.semantic.SemanticVPMAnalyzer.java
License:Open Source License
private Map<String, Integer> getTermsFromIndex() {
    Map<String, Integer> indexedTerms = Maps.newLinkedHashMap();
    try {
        DirectoryReader indexReader = indexer.getIndexReader();
        Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(Indexer.INDEX_CONTENT);
        if (terms == null) {
            return indexedTerms;
        }
        TermsEnum termEnum = terms.iterator(null);
        BytesRef byteRef = null;
        while ((byteRef = termEnum.next()) != null) {
            String term = byteRef.utf8ToString();
            int count = indexReader.docFreq(new Term(Indexer.INDEX_CONTENT, byteRef));
            indexedTerms.put(term, Integer.valueOf(count));
        }
        indexReader.close();
    } catch (Exception e) {
        logger.error("Failed to dump index", e);
    }
    return indexedTerms;
}
From source file:org.tallison.gramreaper.terms.DumpTerms.java
License:Apache License
private void dumpTopNField(LeafReader leafReader, String field) throws IOException {
    AbstractTokenTFDFPriorityQueue queue = config.sort.equals(DumpTermsConfig.SORT.DF)
            ? new TokenDFPriorityQueue(config.topN)
            : new TokenTFPriorityQueue(config.topN);
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        StringBuilder sb = new StringBuilder();
        int i = 0;
        for (FieldInfo fieldInfo : leafReader.getFieldInfos()) {
            if (i++ > 0) {
                sb.append("\n");
            }
            sb.append(fieldInfo.name);
        }
        throw new RuntimeException("I can't find field \"" + field + "\".\n" + "I only see:\n" + sb.toString());
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int docsWThisField = leafReader.getDocCount(field);
    while (bytesRef != null) {
        int df = termsEnum.docFreq();
        long tf = termsEnum.totalTermFreq();
        if (config.minDocFreq > -1 && df < config.minDocFreq) {
            bytesRef = termsEnum.next();
            continue;
        }
        if (config.minDocPercentage > -1.0d
                && (double) df / (double) docsWThisField < config.minDocPercentage) {
            bytesRef = termsEnum.next();
            continue;
        }
        if (queue.top() == null || queue.size() < config.topN
                || (config.sort.equals(DumpTermsConfig.SORT.DF) ? df >= queue.top().df : tf > queue.top().tf)) {
            String t = bytesRef.utf8ToString();
            if (!config.stopWords.contains(t) && !config.startWords.contains(t)) {
                queue.insertWithOverflow(new TokenDFTF(t, df, tf));
            }
        }
        bytesRef = termsEnum.next();
    }

    if (config.outputFile == null) {
        StringBuilder sb = new StringBuilder();
        for (TokenDFTF tp : queue.getArray()) {
            System.out.println(getRow(sb, tp));
        }
    } else if (Files.isDirectory(config.outputFile)) {
        writeTopN(config.outputFile.resolve(field), queue);
    } else {
        writeTopN(config.outputFile, queue);
    }
}