List of usage examples for org.apache.lucene.util.UnicodeUtil.UTF8toUTF16
public static int UTF8toUTF16(BytesRef bytesRef, char[] chars)
From source file:com.github.le11.nls.lucene.TypeAwareSynonymFilter.java
License:Apache License
private void addOutput(BytesRef bytes, int matchInputLength) { bytesReader.reset(bytes.bytes, bytes.offset, bytes.length); final int code = bytesReader.readVInt(); final boolean keepOrig = (code & 0x1) == 0; final int count = code >>> 1; //System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig); for (int outputIDX = 0; outputIDX < count; outputIDX++) { synonyms.words.get(bytesReader.readVInt(), scratchBytes); //System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length); UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars); int lastStart = scratchChars.offset; final int chEnd = lastStart + scratchChars.length; int outputUpto = nextRead; for (int chIDX = lastStart; chIDX <= chEnd; chIDX++) { if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) { final int outputLen = chIDX - lastStart; // Caller is not allowed to have empty string in // the output: assert outputLen > 0 : "output contains empty string: " + scratchChars; futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen); //System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto); lastStart = 1 + chIDX;/*from w ww .j a v a 2 s. c om*/ //System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig); outputUpto = rollIncr(outputUpto); assert futureOutputs[outputUpto].posIncr == 1 : "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite; } } } int upto = nextRead; for (int idx = 0; idx < matchInputLength; idx++) { futureInputs[upto].keepOrig |= keepOrig; futureInputs[upto].matched = true; upto = rollIncr(upto); } }
From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java
License:Apache License
/** * a constructor./* w ww. jav a 2 s. c o m*/ * * @param reader IndexReader of the index * @param docId document id to be highlighted * @param fieldName field of the document to be highlighted * @param fieldQuery FieldQuery object * @throws IOException If there is a low-level I/O error */ public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName, final SpanAwareFieldQuery fieldQuery) throws IOException { this.fieldName = fieldName; Set<String> termSet = fieldQuery.getTermSet(fieldName); Set<String> alwaysHighlightTermSet = fieldQuery.getHighlightTermSet(fieldName); // just return to make null snippet if un-matched fieldName specified when fieldMatch == true if (termSet == null) return; final Fields vectors = reader.getTermVectors(docId); if (vectors == null) { // null snippet return; } final Terms vector = vectors.terms(fieldName); if (vector == null) { // null snippet return; } final CharsRef spare = new CharsRef(); final TermsEnum termsEnum = vector.iterator(null); DocsAndPositionsEnum dpEnum = null; BytesRef text; int numDocs = reader.maxDoc(); while ((text = termsEnum.next()) != null) { UnicodeUtil.UTF8toUTF16(text, spare); final String term = spare.toString(); if (!termSet.contains(term)) { continue; } dpEnum = termsEnum.docsAndPositions(null, dpEnum); if (dpEnum == null) { // null snippet return; } dpEnum.nextDoc(); // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html final float weight = (float) (Math .log(numDocs / (double) (reader.docFreq(new Term(fieldName, text)) + 1)) + 1.0); final int freq = dpEnum.freq(); for (int i = 0; i < freq; i++) { int pos = dpEnum.nextPosition(); if (dpEnum.startOffset() < 0) { return; // no offsets, null snippet } if (alwaysHighlightTermSet.contains(term) || fieldQuery.doesDocFieldContainPosition(fieldName, docId, dpEnum.startOffset())) { termList.add(new TermInfo(term, dpEnum.startOffset(), dpEnum.endOffset(), pos, weight)); } } } // sort 
by position Collections.sort(termList); }
From source file:io.crate.expression.scalar.regex.RegexMatcher.java
License:Apache License
/**
 * Decodes the UTF-8 content of {@code bytes} into {@code charsRef}.
 *
 * <p>The target's backing array is replaced by a new one of {@code bytes.length} chars
 * when it is shorter than that; {@code charsRef.length} is set to the value returned
 * by the conversion.
 *
 * @param bytes    UTF-8 encoded input
 * @param charsRef target whose {@code chars} and {@code length} are updated
 */
private static void utf8toUtf16(BytesRef bytes, CharsRef charsRef) {
    final int required = bytes.length;
    char[] target = charsRef.chars;
    if (target.length < required) {
        target = new char[required];
        charsRef.chars = target;
    }
    // NOTE(review): charsRef.offset is left untouched; presumably callers always pass a
    // CharsRef whose offset is 0 - confirm.
    charsRef.length = UnicodeUtil.UTF8toUTF16(bytes, target);
}
From source file:io.crate.operation.scalar.regex.RegexMatcher.java
License:Apache License
/**
 * Converts the UTF-8 bytes referenced by {@code bytes} to UTF-16 chars stored in
 * {@code charsRef}.
 *
 * <p>If the existing char buffer has fewer than {@code bytes.length} slots it is
 * swapped for a fresh buffer of exactly that size before converting. Afterwards
 * {@code charsRef.length} holds the value returned by the conversion.
 *
 * @param bytes    source of the UTF-8 data
 * @param charsRef destination; its {@code chars} and {@code length} fields are written
 */
private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) {
    final boolean bufferTooSmall = charsRef.chars.length < bytes.length;
    if (bufferTooSmall) {
        charsRef.chars = new char[bytes.length];
    }
    // NOTE(review): charsRef.offset is not reset here; presumably it is always 0 for
    // the instances passed in - confirm against callers.
    final int charCount = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
    charsRef.length = charCount;
}
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizer.java
License:Apache License
void tokenizeWholeBlock() { queue.clear();// w w w . j av a 2 s .com int nextStart = 0; final int end = block.length(); boolean afterSynonymProduced = false; final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); for (int idx = 0; idx < synonyms.size(); idx++) { final MyToken synonym = synonyms.get(idx); tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced); // enqueue prev-synonym if (expand) { int limitOffset = 0; if (idx > 0) { limitOffset = synonyms.get(idx - 1).endOffset; } processPrevSynonym(synonym.startOffset, limitOffset); } // enqueue synonyms if (expand) { bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length); final int code = bytesReader.readVInt(); // final boolean keepOrig = (code & 0x1) == 0; // not used final int count = code >>> 1; for (int i = 0; i < count; i++) { map.words.get(bytesReader.readVInt(), scratchBytes); if (scratchChars.chars.length < scratchBytes.length) { scratchChars.chars = new char[scratchBytes.length]; } scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars); final String word = scratchChars.toString(); int posInc = 0, seq = i + 1; if (synonym.word.equals(word)) { posInc = 1; seq = 0; } queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq)); } } else { queue.add(synonym); } // enqueue after-synonym if (expand) { int limitOffset = block.length(); if (idx < synonyms.size() - 1) { limitOffset = synonyms.get(idx + 1).startOffset; } afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset); } nextStart = synonym.endOffset; } tokenizePartialBlock(nextStart, end, afterSynonymProduced); }
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader, IndexSchema schema) throws IOException { final CharsRef spare = new CharsRef(); SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>(); for (Object o : doc.getFields()) { Field field = (Field) o; SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>(); SchemaField sfield = schema.getFieldOrNull(field.name()); FieldType ftype = (sfield == null) ? null : sfield.getType(); f.add("type", (ftype == null) ? null : ftype.getTypeName()); f.add("schema", getFieldFlags(sfield)); f.add("flags", getFieldFlags(field)); Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue()); f.add("value", (ftype == null) ? null : ftype.toExternal(field)); // TODO: this really should be "stored" f.add("internal", field.stringValue()); // may be a binary number BytesRef bytes = field.binaryValue(); if (bytes != null) { f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length)); }//from w w w .j a va2 s . co m f.add("boost", field.boost()); f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields // If we have a term vector, return that if (field.fieldType().storeTermVectors()) { try { Terms v = reader.getTermVector(docId, field.name()); if (v != null) { SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>(); final TermsEnum termsEnum = v.iterator(null); BytesRef text; while ((text = termsEnum.next()) != null) { final int freq = (int) termsEnum.totalTermFreq(); UnicodeUtil.UTF8toUTF16(text, spare); tfv.add(spare.toString(), freq); } f.add("termVector", tfv); } } catch (Exception ex) { log.warn("error writing term vector", ex); } } finfo.add(field.name(), f); } return finfo; }
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
@SuppressWarnings("unchecked") private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap) throws IOException { SolrParams params = req.getParams(); final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT); TopTermQueue tiq = new TopTermQueue(numTerms + 1); // Something to collect the top N terms in. final CharsRef spare = new CharsRef(); Fields fields = MultiFields.getFields(req.getSearcher().getIndexReader()); if (fields == null) { // No indexed fields return;//from w ww . j a v a 2 s . co m } Terms terms = fields.terms(field); if (terms == null) { // No terms in the field. return; } TermsEnum termsEnum = terms.iterator(null); BytesRef text; int[] buckets = new int[HIST_ARRAY_SIZE]; while ((text = termsEnum.next()) != null) { ++tiq.distinctTerms; int freq = termsEnum.docFreq(); // This calculation seems odd, but it gives the same results as it used to. int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1)); buckets[slot] = buckets[slot] + 1; if (numTerms > 0 && freq > tiq.minFreq) { UnicodeUtil.UTF8toUTF16(text, spare); String t = spare.toString(); tiq.add(new TopTermQueue.TermInfo(new Term(field, t), termsEnum.docFreq())); if (tiq.size() > numTerms) { // if tiq full tiq.pop(); // remove lowest in tiq tiq.minFreq = tiq.getTopTermInfo().docFreq; } } } tiq.histogram.add(buckets); fieldMap.add("distinct", tiq.distinctTerms); // Include top terms fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema())); // Add a histogram fieldMap.add("histogram", tiq.histogram.toNamedList()); }
From source file:org.apache.solr.handler.component.HelloHandlerComponent.java
License:Apache License
protected void doFieldSortValues(ResponseBuilder rb, SolrIndexSearcher searcher) throws IOException { SolrQueryRequest req = rb.req;/*from www.j a v a2 s .c o m*/ SolrQueryResponse rsp = rb.rsp; final CharsRef spare = new CharsRef(); // The query cache doesn't currently store sort field values, and SolrIndexSearcher doesn't // currently have an option to return sort field values. Because of this, we // take the documents given and re-derive the sort values. boolean fsv = req.getParams().getBool(ResponseBuilder.FIELD_SORT_VALUES, false); if (fsv) { Sort sort = searcher.weightSort(rb.getSortSpec().getSort()); SortField[] sortFields = sort == null ? new SortField[] { SortField.FIELD_SCORE } : sort.getSort(); NamedList<Object[]> sortVals = new NamedList<Object[]>(); // order is important for the sort fields Field field = new StringField("dummy", "", Field.Store.NO); // a dummy Field IndexReaderContext topReaderContext = searcher.getTopReaderContext(); List<AtomicReaderContext> leaves = topReaderContext.leaves(); AtomicReaderContext currentLeaf = null; if (leaves.size() == 1) { // if there is a single segment, use that subReader and avoid looking up each time currentLeaf = leaves.get(0); leaves = null; } DocList docList = rb.getResults().docList; // sort ids from lowest to highest so we can access them in order int nDocs = docList.size(); long[] sortedIds = new long[nDocs]; DocIterator it = rb.getResults().docList.iterator(); for (int i = 0; i < nDocs; i++) { sortedIds[i] = (((long) it.nextDoc()) << 32) | i; } Arrays.sort(sortedIds); for (SortField sortField : sortFields) { SortField.Type type = sortField.getType(); if (type == SortField.Type.SCORE || type == SortField.Type.DOC) continue; FieldComparator comparator = null; String fieldname = sortField.getField(); FieldType ft = fieldname == null ? 
null : req.getSchema().getFieldTypeNoEx(fieldname); Object[] vals = new Object[nDocs]; int lastIdx = -1; int idx = 0; for (long idAndPos : sortedIds) { int doc = (int) (idAndPos >>> 32); int position = (int) idAndPos; if (leaves != null) { idx = ReaderUtil.subIndex(doc, leaves); currentLeaf = leaves.get(idx); if (idx != lastIdx) { // we switched segments. invalidate comparator. comparator = null; } } if (comparator == null) { comparator = sortField.getComparator(1, 0); comparator = comparator.setNextReader(currentLeaf); } doc -= currentLeaf.docBase; // adjust for what segment this is in comparator.copy(0, doc); Object val = comparator.value(0); // Sortable float, double, int, long types all just use a string // comparator. For these, we need to put the type into a readable // format. One reason for this is that XML can't represent all // string values (or even all unicode code points). // indexedToReadable() should be a no-op and should // thus be harmless anyway (for all current ways anyway) if (val instanceof String) { field.setStringValue((String) val); val = ft.toObject(field); } // Must do the same conversion when sorting by a // String field in Lucene, which returns the terms // data as BytesRef: if (val instanceof BytesRef) { UnicodeUtil.UTF8toUTF16((BytesRef) val, spare); field.setStringValue(spare.toString()); val = ft.toObject(field); } vals[position] = val; } sortVals.add(fieldname, vals); } rsp.add("sort_values", sortVals); } }
From source file:org.apache.solr.handler.component.QueryComponent.java
License:Apache License
protected void doFieldSortValues(ResponseBuilder rb, SolrIndexSearcher searcher) throws IOException { SolrQueryRequest req = rb.req;//w ww. j a v a 2 s. com SolrQueryResponse rsp = rb.rsp; final CharsRef spare = new CharsRef(); // The query cache doesn't currently store sort field values, and SolrIndexSearcher doesn't // currently have an option to return sort field values. Because of this, we // take the documents given and re-derive the sort values. boolean fsv = req.getParams().getBool(ResponseBuilder.FIELD_SORT_VALUES, false); if (fsv) { Sort sort = searcher.weightSort(rb.getSortSpec().getSort()); SortField[] sortFields = sort == null ? new SortField[] { SortField.FIELD_SCORE } : sort.getSort(); NamedList<Object[]> sortVals = new NamedList<Object[]>(); // order is important for the sort fields Field field = new StringField("dummy", "", Field.Store.NO); // a dummy Field IndexReaderContext topReaderContext = searcher.getTopReaderContext(); List<AtomicReaderContext> leaves = topReaderContext.leaves(); AtomicReaderContext currentLeaf = null; if (leaves.size() == 1) { // if there is a single segment, use that subReader and avoid looking up each time currentLeaf = leaves.get(0); leaves = null; } DocList docList = rb.getResults().docList; // sort ids from lowest to highest so we can access them in order int nDocs = docList.size(); long[] sortedIds = new long[nDocs]; DocIterator it = rb.getResults().docList.iterator(); for (int i = 0; i < nDocs; i++) { sortedIds[i] = (((long) it.nextDoc()) << 32) | i; } Arrays.sort(sortedIds); for (SortField sortField : sortFields) { SortField.Type type = sortField.getType(); if (type == SortField.Type.SCORE || type == SortField.Type.DOC) continue; FieldComparator comparator = null; String fieldname = sortField.getField(); FieldType ft = fieldname == null ? 
null : searcher.getSchema().getFieldTypeNoEx(fieldname); Object[] vals = new Object[nDocs]; int lastIdx = -1; int idx = 0; for (long idAndPos : sortedIds) { int doc = (int) (idAndPos >>> 32); int position = (int) idAndPos; if (leaves != null) { idx = ReaderUtil.subIndex(doc, leaves); currentLeaf = leaves.get(idx); if (idx != lastIdx) { // we switched segments. invalidate comparator. comparator = null; } } if (comparator == null) { comparator = sortField.getComparator(1, 0); comparator = comparator.setNextReader(currentLeaf); } doc -= currentLeaf.docBase; // adjust for what segment this is in comparator.copy(0, doc); Object val = comparator.value(0); // Sortable float, double, int, long types all just use a string // comparator. For these, we need to put the type into a readable // format. One reason for this is that XML can't represent all // string values (or even all unicode code points). // indexedToReadable() should be a no-op and should // thus be harmless anyway (for all current ways anyway) if (val instanceof String) { field.setStringValue((String) val); val = ft.toObject(field); } // Must do the same conversion when sorting by a // String field in Lucene, which returns the terms // data as BytesRef: if (val instanceof BytesRef) { UnicodeUtil.UTF8toUTF16((BytesRef) val, spare); field.setStringValue(spare.toString()); val = ft.toObject(field); } vals[position] = val; } sortVals.add(fieldname, vals); } rsp.add("sort_values", sortVals); } }
From source file:org.apache.solr.request.PerSegmentSingleValuedFaceting.java
License:Apache License
@Override public boolean collect(BytesRef term, int count) { if (count > min) { // NOTE: we use c>min rather than c>=min as an optimization because we are going in // index order, so we already know that the keys are ordered. This can be very // important if a lot of the counts are repeated (like zero counts would be). UnicodeUtil.UTF8toUTF16(term, spare); queue.add(new SimpleFacets.CountPair<String, Integer>(spare.toString(), count)); if (queue.size() >= maxsize) min = queue.last().val; }//from w ww .j ava 2 s . com return false; }