List of usage examples for `org.apache.lucene.index.Term#bytes()`, which returns the term's `BytesRef`.
To view the full source code of any example for `org.apache.lucene.index.Term#bytes()`, click its Source link.
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/** * Calculate probabilities for all classes for a given input text * @param inputDocument the input text as a {@code String} * @return a {@code List} of {@code ClassificationResult}, one for each existing class * @throws IOException if assigning probabilities fails *//*from ww w . j a va2 s . com*/ protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException { List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>(); Terms classes = MultiFields.getTerms(leafReader, classFieldName); TermsEnum classesEnum = classes.iterator(); BytesRef next; String[] tokenizedText = tokenize(inputDocument); int docsWithClassSize = countDocsWithClass(); while ((next = classesEnum.next()) != null) { if (next.length > 0) { Term term = new Term(this.classFieldName, next); double clVal = calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(tokenizedText, term, docsWithClassSize); assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal)); } } // normalization; the values transforms to a 0-1 range return normClassificationResults(assignedClasses); }
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException { List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>(); Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>(); Map<String, Float> fieldName2boost = new LinkedHashMap<>(); Terms classes = MultiFields.getTerms(leafReader, classFieldName); TermsEnum classesEnum = classes.iterator(); BytesRef c;/* ww w.ja v a2 s . c om*/ analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost); int docsWithClassSize = countDocsWithClass(); while ((c = classesEnum.next()) != null) { double classScore = 0; Term term = new Term(this.classFieldName, c); for (String fieldName : textFieldNames) { List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName); double fieldScore = 0; for (String[] fieldTokensArray : tokensArrays) { fieldScore += calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize) * fieldName2boost.get(fieldName); } classScore += fieldScore; } assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore)); } return normClassificationResults(assignedClasses); }
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) { try {/* w w w .j av a 2 s . c om*/ Term term = new Term(VALUE_LABEL, blockingKey); List<Integer> entityIds = new ArrayList<>(); int docFrequency = iReader.docFreq(term); if (0 < docFrequency) { BytesRef text = term.bytes(); DocsEnum de = MultiFields.getTermDocsEnum(iReader, MultiFields.getLiveDocs(iReader), VALUE_LABEL, text); int doc; while ((doc = de.nextDoc()) != DocsEnum.NO_MORE_DOCS) { entityIds.add(docIds[doc]); } } return entityIds; } catch (IOException ex) { ex.printStackTrace(); return null; } }
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) { try {//from www. jav a 2 s. c o m Term term = new Term(VALUE_LABEL, blockingKey); List<Integer> entityIds = new ArrayList<>(); int docFrequency = iReader.docFreq(term); if (0 < docFrequency) { BytesRef text = term.bytes(); PostingsEnum pe = MultiFields.getTermDocsEnum(iReader, VALUE_LABEL, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(docIds[doc]); } } return entityIds; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file:br.ufmt.periscope.indexer.resources.search.FastJoinTermEnum.java
public FastJoinTermEnum(TermsEnum tenum, Term name, FuzzyTokenSimilarity ts) { super(tenum); this.name = name.bytes(); this.ts = ts; setInitialSeekTerm(this.name); }
From source file:com.ifactory.press.db.solr.HitCount.java
License:Apache License
@Override public ValueSource parse(FunctionQParser fp) throws SyntaxError { // hitcount() takes no arguments. If we wanted to pass a query // we could call fp.parseNestedQuery() HashSet<String> fields = new HashSet<String>(); while (fp.hasMoreArguments()) { fields.add(fp.parseArg());/*from w w w.ja va2 s.c om*/ } Query q = fp.subQuery(fp.getParams().get("q"), "lucene").getQuery(); HashSet<Term> terms = new HashSet<Term>(); try { q.extractTerms(terms); } catch (UnsupportedOperationException e) { return new DoubleConstValueSource(1); } ArrayList<ValueSource> termcounts = new ArrayList<ValueSource>(); for (Term t : terms) { if (fields.isEmpty() || fields.contains(t.field())) { termcounts.add(new TermFreqValueSource(t.field(), t.text(), t.field(), t.bytes())); } } return new SumFloatFunction(termcounts.toArray(new ValueSource[termcounts.size()])); }
From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java
License:Open Source License
public LuceneQueryHighlighter(JochreQuery jochreQuery, IndexSearcher indexSearcher) { try {//from w ww.ja v a 2s . c o m this.indexSearcher = indexSearcher; this.jochreQuery = jochreQuery; query = rewrite(jochreQuery.getLuceneQuery()); queryTerms = new TreeSet<Term>(); query.extractTerms(queryTerms); if (LOG.isTraceEnabled()) queryTermList = new ArrayList<Term>(queryTerms); final IndexReader reader = indexSearcher.getIndexReader(); // add 1 to doc count to ensure even terms in all docs get a very small weight docCountLog = Math.log(reader.numDocs() + 1); IndexReaderContext readerContext = reader.getContext(); leaves = readerContext.leaves(); // since the same terms might be contained in the query multiple times (e.g. once per field) // we only consider them once each by using a HashSet terms = new HashSet<BytesRef>(); Map<BytesRef, Integer> termFreqs = new HashMap<BytesRef, Integer>(); for (Term term : queryTerms) { terms.add(term.bytes()); termFreqs.put(term.bytes(), 0); } termLogs = new HashMap<BytesRef, Double>(); for (Term term : queryTerms) { int freq = termFreqs.get(term.bytes()); freq += reader.docFreq(term); termFreqs.put(term.bytes(), freq); } for (BytesRef term : terms) { int freq = termFreqs.get(term); termLogs.put(term, Math.log(freq)); } } catch (IOException e) { LogUtils.logError(LOG, e); throw new RuntimeException(e); } }
From source file:com.lucene.MyPrefixQuery.java
License:Apache License
public MyPrefixQuery(Term prefix) { super(prefix, toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true); if (prefix == null) { throw new NullPointerException("prefix must not be null"); }//from ww w. j av a 2 s.c om }
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
@Override protected Suggest.Suggestion<Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>> innerExecute( String name, PrefixSuggestionContext suggestion, IndexReader indexReader, CharsRefBuilder spare) throws IOException { List<Term> analyzed = queryTerms(suggestion, spare); //If there is more than one parameter, return it. We can't do accurate phrase suggestions cheaply, so we don't. if (analyzed.size() != 1) { return null; }//w ww . j a va 2s .c om Term text = analyzed.get(0); //E.g. "a" or "" (empty string) would list more or less all terms in the dictionary. Don't wanna do that. if (text.bytes().length < suggestion.getMinPrefixLength()) { return null; } PrefixTermsEnum prefixTermsEnum = terms(indexReader, text); //E.g. there is no field or no prefix like this if (prefixTermsEnum == null) return null; final int size = suggestion.getSize(); return singleEntrySuggestion(name, size, getOptions(text, prefixTermsEnum, indexReader, size)); }
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text, PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException { OptionQueue collectionQueue = new OptionQueue(size); BytesRef ref;/*from w w w .jav a 2 s.c o m*/ while ((ref = prefixTermsEnum.next()) != null) { Term term = new Term(text.field(), BytesRef.deepCopyOf(ref)); collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option( new StringText(term.bytes().utf8ToString()), indexReader.docFreq(term))); } Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry = new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>( new StringText(text.text()), 0, text.bytes().length); while (collectionQueue.size() > 0) { entry.addOption(collectionQueue.pop()); } return entry; }