Usage examples for `org.apache.lucene.util.CharsRefBuilder#copyUTF8Bytes(BytesRef)` collected from open-source projects.
public void copyUTF8Bytes(BytesRef bytes)
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field *//*w w w . j av a2 s . c o m*/ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Print a term vector for debugging//from ww w. j a v a 2 s. c om * * @param vector List of terms and their frequencies for a doc/field * @throws IOException */ @SuppressWarnings("unused") private void print(Terms vector) throws IOException { if (vector == null) return; final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; // termsEnum.docFreq() = 1, // The returned Fields instance acts like a single-document inverted index HashMap<String, Long> map = new HashMap<String, Long>(); while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); map.put(spare.toString(), termsEnum.totalTermFreq()); } @SuppressWarnings("unchecked") Map.Entry<String, Long>[] a = map.entrySet().toArray(new Map.Entry[0]); Arrays.sort(a, new Comparator<Map.Entry<String, Long>>() { public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) { return o2.getValue().compareTo(o1.getValue()); } }); for (Map.Entry<String, Long> e : a) { System.out.print(e.getKey() + ":" + e.getValue() + " "); } System.out.println(); }
From source file:com.nlp.mlt.SimQuery.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field */// w w w.j ava2 s . c om private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); System.out.println(term + " = " + freq); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
/**
 * Builds a per-field info map for one stored document: type, schema/index
 * flags, external and internal values, binary value, boost, docFreq, and
 * (when stored) the term vector.
 *
 * @param doc    the stored document to describe
 * @param docId  the document's id, used to fetch its term vector
 * @param reader reader used for docFreq and term-vector lookups
 * @param schema schema used to resolve field types; a field may have no
 *               schema entry, in which case type-dependent values are null
 * @return ordered map keyed by field name
 * @throws IOException if index access fails
 */
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();

        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number

        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        // docFreq can be 0 for non-indexed fields
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t));

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                // Best effort: a broken term vector should not break the whole response.
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
/**
 * Adds detailed term statistics for one field to {@code fieldMap}: the
 * distinct-term count, the top-N terms by docFreq, and a log2-bucketed
 * docFreq histogram.
 *
 * @param req      request supplying params, searcher, and schema
 * @param field    field to analyze; silently returns if it does not exist
 * @param fieldMap output map receiving "distinct", "topTerms" and "histogram"
 * @throws IOException if index access fails
 */
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
        throws IOException {

    SolrParams params = req.getParams();
    final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

    // Something to collect the top N terms in.
    TopTermQueue tiq = new TopTermQueue(numTerms + 1);

    final CharsRefBuilder spare = new CharsRefBuilder();

    Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
    if (terms == null) { // field does not exist
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int[] buckets = new int[HIST_ARRAY_SIZE];
    while ((text = termsEnum.next()) != null) {
        ++tiq.distinctTerms;
        int freq = termsEnum.docFreq();
        // This calculation seems odd, but it gives the same results as it used to.
        // Bucket index is ceil(log2(freq)); freq <= 1 lands in slot 0.
        int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
        buckets[slot] = buckets[slot] + 1;
        if (numTerms > 0 && freq > tiq.minFreq) {
            spare.copyUTF8Bytes(text);
            String t = spare.toString();

            tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
            if (tiq.size() > numTerms) { // if tiq full
                tiq.pop(); // remove lowest in tiq
                // Raise the bar so cheaper terms are rejected without a queue insert.
                tiq.minFreq = tiq.getTopTermInfo().docFreq;
            }
        }
    }
    tiq.histogram.add(buckets);
    fieldMap.add("distinct", tiq.distinctTerms);

    // Include top terms
    fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));

    // Add a histogram
    fieldMap.add("histogram", tiq.histogram.toNamedList());
}
From source file:org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field * @param fieldName Optional field name of the terms for skip terms *//* w w w . jav a 2 s . co m*/ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } if (isSkipTerm(fieldName, term)) { continue; } final PostingsEnum docs = termsEnum.postings(null); int freq = 0; while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { freq += docs.freq(); } // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:org.codelibs.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.java
License:Apache License
/**
 * Decodes {@code toAnalyze} from UTF-8 into the reusable {@code spare} buffer,
 * runs it through the analyzer for {@code field}, and feeds the resulting
 * token stream to {@code consumer}.
 *
 * @return the number of tokens produced, as reported by {@link #analyze(TokenStream, TokenConsumer)}
 * @throws IOException if analysis fails
 */
public static int analyze(Analyzer analyzer, BytesRef toAnalyze, String field, TokenConsumer consumer,
        CharsRefBuilder spare) throws IOException {
    // Decode the UTF-8 bytes into the caller-provided char buffer.
    spare.copyUTF8Bytes(toAnalyze);
    final CharsRef decoded = spare.get();
    final FastCharArrayReader input = new FastCharArrayReader(decoded.chars, decoded.offset, decoded.length);
    try (TokenStream stream = analyzer.tokenStream(field, input)) {
        return analyze(stream, consumer);
    }
}
From source file:org.codelibs.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.java
License:Apache License
/**
 * Decodes the UTF-8 {@code query} into the reusable {@code spare} buffer and
 * returns the analyzer's token stream over the decoded characters for
 * {@code field}. The caller owns (and must close) the returned stream.
 *
 * @throws IOException if the analyzer fails to create the stream
 */
public TokenStream tokenStream(Analyzer analyzer, BytesRef query, CharsRefBuilder spare, String field)
        throws IOException {
    spare.copyUTF8Bytes(query);
    return analyzer.tokenStream(field, new FastCharArrayReader(spare.chars(), 0, spare.length()));
}
From source file:org.elasticsearch.action.allterms.TransportAllTermsShardAction.java
License:Apache License
@Override protected AllTermsSingleShardResponse shardOperation(AllTermsShardRequest request, ShardId shardId) throws ElasticsearchException { List<String> terms = new ArrayList<>(); IndexService indexService = indicesService.indexServiceSafe(request.index()); IndexShard indexShard = indexService.shardSafe(shardId.id()); final Engine.Searcher searcher = indexShard.acquireSearcher("all_terms"); IndexReader topLevelReader = searcher.reader(); List<AtomicReaderContext> leaves = topLevelReader.leaves(); try {/*from w w w . j av a 2s . c o m*/ if (leaves.size() == 0) { return new AllTermsSingleShardResponse(terms); } List<TermsEnum> termIters = new ArrayList<>(); try { for (AtomicReaderContext reader : leaves) { termIters.add(reader.reader().terms(request.field()).iterator(null)); } } catch (IOException e) { } CharsRefBuilder spare = new CharsRefBuilder(); BytesRef lastTerm = null; int[] exhausted = new int[termIters.size()]; for (int i = 0; i < exhausted.length; i++) { exhausted[i] = 0; } try { //first find smallest term for (int i = 0; i < termIters.size(); i++) { BytesRef curTerm = null; if (request.from() != null) { TermsEnum.SeekStatus seekStatus = termIters.get(i).seekCeil(new BytesRef(request.from())); if (seekStatus.equals(TermsEnum.SeekStatus.END) == false) { curTerm = termIters.get(i).term(); } } else { curTerm = termIters.get(i).next(); } if (lastTerm == null) { lastTerm = curTerm; if (lastTerm == null || lastTerm.length == 0) { lastTerm = null; exhausted[i] = 1; } } else { if (curTerm.compareTo(lastTerm) < 0) { lastTerm = curTerm; } } } if (lastTerm == null) { return new AllTermsSingleShardResponse(terms); } if (getDocFreq(termIters, lastTerm, request.field(), exhausted) >= request.minDocFreq()) { spare.copyUTF8Bytes(lastTerm); terms.add(spare.toString()); } BytesRef blah = new BytesRef(); blah.copyBytes(lastTerm); lastTerm = blah; while (terms.size() < request.size() && lastTerm != null) { moveIterators(exhausted, termIters, lastTerm, shardId); lastTerm = 
findMinimum(exhausted, termIters, shardId); if (lastTerm != null) { if (getDocFreq(termIters, lastTerm, request.field(), exhausted) >= request.minDocFreq()) { spare.copyUTF8Bytes(lastTerm); terms.add(spare.toString()); } } } } catch (IOException e) { } logger.trace("[{}], final terms list: {}", shardId, terms); return new AllTermsSingleShardResponse(terms); } finally { searcher.close(); } }
From source file:org.elasticsearch.action.allterms.TransportAllTermsShardAction.java
License:Apache License
private long getDocFreq(List<TermsEnum> termIters, BytesRef lastTerm, String field, int[] exhausted) { long docFreq = 0; if (logger.isTraceEnabled()) { CharsRefBuilder b = new CharsRefBuilder(); b.copyUTF8Bytes(lastTerm); logger.trace("Compute doc freq for {}", b.toString()); }//from www . j a v a 2 s.com for (int i = 0; i < termIters.size(); i++) { if (exhausted[i] == 0) { try { if (logger.isTraceEnabled()) { CharsRefBuilder b = new CharsRefBuilder(); b.copyUTF8Bytes(termIters.get(i).term()); logger.trace("Doc freq on seg {} for term {} is {}", i, b.toString(), termIters.get(i).docFreq()); } if (termIters.get(i).term().compareTo(lastTerm) == 0) { docFreq += termIters.get(i).docFreq(); } } catch (IOException e) { } } } return docFreq; }