List of usage examples for the org.apache.lucene.util.CharsRefBuilder constructor:
public CharsRefBuilder()
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field *///from w w w . ja va 2 s . c o m private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Print a term vector for debugging/*from w w w. j a va 2s . c om*/ * * @param vector List of terms and their frequencies for a doc/field * @throws IOException */ @SuppressWarnings("unused") private void print(Terms vector) throws IOException { if (vector == null) return; final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; // termsEnum.docFreq() = 1, // The returned Fields instance acts like a single-document inverted index HashMap<String, Long> map = new HashMap<String, Long>(); while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); map.put(spare.toString(), termsEnum.totalTermFreq()); } @SuppressWarnings("unchecked") Map.Entry<String, Long>[] a = map.entrySet().toArray(new Map.Entry[0]); Arrays.sort(a, new Comparator<Map.Entry<String, Long>>() { public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) { return o2.getValue().compareTo(o1.getValue()); } }); for (Map.Entry<String, Long> e : a) { System.out.print(e.getKey() + ":" + e.getValue() + " "); } System.out.println(); }
From source file:com.nlp.mlt.SimQuery.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field *///w w w . ja va 2s . com private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); System.out.println(term + " = " + freq); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:com.shaie.fst.FstExample.java
License:Apache License
/**
 * Builds a small {@link SynonymMap} mapping several color phrases to the
 * single output token "color", then dumps the map's FST in Graphviz DOT
 * format.
 *
 * <p>Usage: {@code FstExample [outputDotFile]} — when no argument is given,
 * the original hard-coded (Windows-specific) destination is used.
 */
public static void main(String[] args) throws Exception {
    final CharsRef output = new CharsRef("color");
    final SynonymMap.Builder builder = new SynonymMap.Builder(true);
    // Every input phrase (possibly multi-word) maps to the same output "color".
    final String[] phrases = { "blue", "green", "pale green", "pale blue", "dark sea green" };
    for (String phrase : phrases) {
        builder.add(SynonymMap.Builder.join(phrase.split(" "), new CharsRefBuilder()), output, true);
    }
    final SynonymMap synMap = builder.build();
    // Allow overriding the hard-coded path; default preserves original behavior.
    final String dotFile = args.length > 0 ? args[0] : "d:/tmp/syns.dot";
    try (PrintWriter pw = new PrintWriter(dotFile)) {
        Util.toDot(synMap.fst, pw, true, true);
    }
    System.out.println("Done!");
}
From source file:com.shaie.SynonymFilterExample.java
License:Apache License
/**
 * Registers a synonym rule mapping {@code input} to {@code output}. Multi-word
 * phrases are split on single spaces before being joined into the builder's
 * internal phrase representation.
 *
 * @param input  the source phrase, words separated by single spaces
 * @param output the replacement phrase, words separated by single spaces
 * @param builder the synonym map under construction
 */
private static void addSynonym(String input, String output, SynonymMap.Builder builder) {
    builder.add(
            SynonymMap.Builder.join(input.split(" "), new CharsRefBuilder()),
            SynonymMap.Builder.join(output.split(" "), new CharsRefBuilder()),
            true);
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
/**
 * Builds a per-field info map for one stored document: type, schema/index
 * flags, external and internal values, binary payload, boost, docFreq, and
 * (when stored) the field's term vector.
 *
 * @param doc    the stored document to inspect
 * @param docId  the document's id, used to fetch its term vectors
 * @param reader index reader used for docFreq and term vector lookups
 * @param schema schema used to resolve field types
 * @return an ordered map keyed by field name
 * @throws IOException on index access failure
 */
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    // Reused scratch buffer for UTF-8 -> String conversion of term bytes.
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();
        // sfield/ftype may be null for fields not declared in the schema.
        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();
        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));
        // Term built from the indexed form when a type is known, else the raw stored value.
        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());
        f.add("value", (ftype == null) ? null : ftype.toExternal(field));
        // TODO: this really should be "stored"
        f.add("internal", field.stringValue()); // may be a binary number
        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        // this can be 0 for non-indexed fields
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t));
        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                // Best-effort: term vector problems must not abort the whole response.
                log.warn("error writing term vector", ex);
            }
        }
        finfo.add(field.name(), f);
    }
    return finfo;
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
/**
 * Adds detailed statistics for one indexed field to {@code fieldMap}: the
 * number of distinct terms, the top-N terms by docFreq, and a docFreq
 * histogram with power-of-two buckets.
 *
 * @param req      the request; supplies params, searcher, and schema
 * @param field    the field to analyze
 * @param fieldMap destination map; gains "distinct", "topTerms", "histogram"
 * @throws IOException on index access failure
 */
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
        throws IOException {
    SolrParams params = req.getParams();
    final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);
    // Something to collect the top N terms in; sized N+1 so we can add then pop.
    TopTermQueue tiq = new TopTermQueue(numTerms + 1);
    final CharsRefBuilder spare = new CharsRefBuilder();
    Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
    if (terms == null) {
        // field does not exist
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int[] buckets = new int[HIST_ARRAY_SIZE];
    while ((text = termsEnum.next()) != null) {
        ++tiq.distinctTerms;
        int freq = termsEnum.docFreq();
        // Histogram slot is based on the position of freq's highest set bit
        // (roughly ceil(log2(freq))). This calculation seems odd, but
        // it gives the same results as it used to.
        int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
        buckets[slot] = buckets[slot] + 1;
        // Only materialize the term string when it could enter the top-N queue.
        if (numTerms > 0 && freq > tiq.minFreq) {
            spare.copyUTF8Bytes(text);
            String t = spare.toString();
            tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq()));
            if (tiq.size() > numTerms) { // if tiq full
                tiq.pop(); // remove lowest in tiq
                // New threshold: smallest docFreq still in the queue.
                tiq.minFreq = tiq.getTopTermInfo().docFreq;
            }
        }
    }
    tiq.histogram.add(buckets);
    fieldMap.add("distinct", tiq.distinctTerms);
    // Include top terms
    fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));
    // Add a histogram
    fieldMap.add("histogram", tiq.histogram.toNamedList());
}
From source file:org.apache.solr.search.join.BlockJoinFieldFacetAccumulator.java
License:Apache License
/** copy paste from {@link DocValuesFacets} */ NamedList<Integer> getFacetValue() throws IOException { NamedList<Integer> facetValue = new NamedList<>(); final CharsRefBuilder charsRef = new CharsRefBuilder(); // if there is no globs, take segment's ones for (int i = 1; i < (globalCounts != null ? globalCounts.length : segmentAccums.length); i++) { int count = globalCounts != null ? globalCounts[i] : (int) (segmentAccums[i] >> 32); if (count > 0) { BytesRef term = topSSDV.lookupOrd(-1 + i); fieldType.indexedToReadable(term, charsRef); facetValue.add(charsRef.toString(), count); }// ww w . j av a2s . c o m } return facetValue; }
From source file:org.apache.solr.update.processor.TolerantUpdateProcessor.java
License:Apache License
/**
 * Returns the output of {@link org.apache.solr.schema.FieldType#
 * indexedToReadable(BytesRef, CharsRefBuilder)} of the field type of the
 * uniqueKey on the {@link BytesRef} passed as parameter. <code>ref</code>
 * should be the indexed representation of the id -- if null (possibly because
 * it's missing in the update) this method will return {@link #UNKNOWN_ID}
 */
private String getPrintableId(BytesRef ref) {
    return (ref == null)
            ? UNKNOWN_ID
            : uniqueKeyField.getType().indexedToReadable(ref, new CharsRefBuilder()).toString();
}
From source file:org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field * @param fieldName Optional field name of the terms for skip terms *//* w w w . j ava 2 s.c o m*/ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } if (isSkipTerm(fieldName, term)) { continue; } final PostingsEnum docs = termsEnum.postings(null); int freq = 0; while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { freq += docs.freq(); } // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }