Example usage for org.apache.lucene.util UnicodeUtil UTF8toUTF16

List of usage examples for org.apache.lucene.util UnicodeUtil UTF8toUTF16

Introduction

On this page you can find example usages of org.apache.lucene.util.UnicodeUtil.UTF8toUTF16.

Prototype

public static int UTF8toUTF16(BytesRef bytesRef, char[] chars) 

Source Link

Document

Utility method for {@link #UTF8toUTF16(byte[], int, int, char[])}.

Usage

From source file:com.github.le11.nls.lucene.TypeAwareSynonymFilter.java

License:Apache License

/**
 * Decodes one encoded synonym output and queues its words as pending outputs, then flags
 * the input slots covered by the match.
 *
 * @param bytes encoded output: a vInt header followed by one vInt per output, each used to
 *        look the output up in {@code synonyms.words}
 * @param matchInputLength number of input slots, starting at {@code nextRead}, to mark as matched
 */
private void addOutput(BytesRef bytes, int matchInputLength) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    // Header vInt: low bit clear means "keep the original token"; the remaining bits
    // hold the number of outputs that follow.
    final int header = bytesReader.readVInt();
    final boolean keepOrig = (header & 0x1) == 0;
    final int outputCount = header >>> 1;

    for (int outputOrd = 0; outputOrd < outputCount; outputOrd++) {
        synonyms.words.get(bytesReader.readVInt(), scratchBytes);
        UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);

        final char[] buffer = scratchChars.chars;
        final int limit = scratchChars.offset + scratchChars.length;
        int wordStart = scratchChars.offset;
        int slot = nextRead;

        // Walk one position past the last char so the final word is flushed too.
        for (int pos = wordStart; pos <= limit; pos++) {
            final boolean atBoundary = pos == limit || buffer[pos] == SynonymMap.WORD_SEPARATOR;
            if (!atBoundary) {
                continue;
            }
            final int wordLen = pos - wordStart;
            // Caller is not allowed to have empty string in the output:
            assert wordLen > 0 : "output contains empty string: " + scratchChars;
            futureOutputs[slot].add(buffer, wordStart, wordLen);
            wordStart = pos + 1;
            slot = rollIncr(slot);
            assert futureOutputs[slot].posIncr == 1 : "outputUpto=" + slot + " vs nextWrite="
                    + nextWrite;
        }
    }

    // Mark every input slot consumed by this match.
    for (int remaining = matchInputLength, slot = nextRead; remaining > 0; remaining--, slot = rollIncr(slot)) {
        futureInputs[slot].keepOrig |= keepOrig;
        futureInputs[slot].matched = true;
    }
}

From source file:com.o19s.solr.swan.highlight.SpanAwareFieldTermStack.java

License:Apache License

/**
 * Builds the term stack for one field of one document from that document's term vector.
 *
 * <p>Only terms contained in the field query's term set are considered. A term occurrence is
 * recorded when the term is in the query's highlight term set or when the query reports that
 * the document field contains the occurrence's start offset. The constructor returns early,
 * leaving the stack as built so far and unsorted, when the term set, the term vectors, the
 * positions enum or the offsets are unavailable.
 *
 * @param reader IndexReader of the index
 * @param docId document id to be highlighted
 * @param fieldName field of the document to be highlighted
 * @param fieldQuery FieldQuery object
 * @throws IOException If there is a low-level I/O error
 */
public SpanAwareFieldTermStack(IndexReader reader, int docId, String fieldName,
        final SpanAwareFieldQuery fieldQuery) throws IOException {
    this.fieldName = fieldName;

    final Set<String> queryTerms = fieldQuery.getTermSet(fieldName);
    final Set<String> forcedTerms = fieldQuery.getHighlightTermSet(fieldName);

    // An unmatched field name (when fieldMatch == true) yields a null snippet.
    if (queryTerms == null) {
        return;
    }

    final Fields termVectors = reader.getTermVectors(docId);
    if (termVectors == null) {
        return; // null snippet
    }

    final Terms fieldVector = termVectors.terms(fieldName);
    if (fieldVector == null) {
        return; // null snippet
    }

    final CharsRef utf16 = new CharsRef();
    final TermsEnum termsEnum = fieldVector.iterator(null);
    DocsAndPositionsEnum postings = null;
    final int maxDoc = reader.maxDoc();

    for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum.next()) {
        UnicodeUtil.UTF8toUTF16(termBytes, utf16);
        final String term = utf16.toString();
        if (!queryTerms.contains(term)) {
            continue;
        }

        postings = termsEnum.docsAndPositions(null, postings);
        if (postings == null) {
            return; // null snippet
        }
        postings.nextDoc();

        // For weight look here: http://lucene.apache.org/core/3_6_0/api/core/org/apache/lucene/search/DefaultSimilarity.html
        final int docFreq = reader.docFreq(new Term(fieldName, termBytes));
        final float weight = (float) (Math.log(maxDoc / (double) (docFreq + 1)) + 1.0);

        int remaining = postings.freq();
        while (remaining-- > 0) {
            final int position = postings.nextPosition();
            final int startOffset = postings.startOffset();
            if (startOffset < 0) {
                return; // no offsets, null snippet
            }

            final boolean highlight = forcedTerms.contains(term)
                    || fieldQuery.doesDocFieldContainPosition(fieldName, docId, startOffset);
            if (highlight) {
                termList.add(new TermInfo(term, startOffset, postings.endOffset(), position, weight));
            }
        }
    }

    // sort by position
    Collections.sort(termList);
}

From source file:io.crate.expression.scalar.regex.RegexMatcher.java

License:Apache License

/**
 * Decodes the UTF-8 bytes referenced by {@code bytes} into {@code charsRef} as UTF-16,
 * growing the backing char array when it is too small.
 *
 * @param bytes UTF-8 encoded input
 * @param charsRef destination; its {@code chars}, {@code offset} and {@code length} are overwritten
 */
private static void utf8toUtf16(BytesRef bytes, CharsRef charsRef) {
    // A UTF-8 sequence never decodes to more UTF-16 code units than it has bytes,
    // so bytes.length chars is always enough room.
    if (charsRef.chars.length < bytes.length) {
        charsRef.chars = new char[bytes.length];
    }
    // The decoder writes from index 0 of the array, so the view must start there as well;
    // otherwise a CharsRef with a non-zero offset would expose the wrong window.
    charsRef.offset = 0;
    charsRef.length = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
}

From source file:io.crate.operation.scalar.regex.RegexMatcher.java

License:Apache License

/**
 * Decodes the UTF-8 bytes referenced by {@code bytes} into {@code charsRef} as UTF-16,
 * growing the backing char array when it is too small.
 *
 * @param bytes UTF-8 encoded input
 * @param charsRef destination; its {@code chars}, {@code offset} and {@code length} are overwritten
 */
private static void UTF8toUTF16(BytesRef bytes, CharsRef charsRef) {
    // A UTF-8 sequence never decodes to more UTF-16 code units than it has bytes,
    // so bytes.length chars is always enough room.
    if (charsRef.chars.length < bytes.length) {
        charsRef.chars = new char[bytes.length];
    }
    // The decoder writes from index 0 of the array, so the view must start there as well;
    // otherwise a CharsRef with a non-zero offset would expose the wrong window.
    charsRef.offset = 0;
    charsRef.length = UnicodeUtil.UTF8toUTF16(bytes, charsRef.chars);
}

From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizer.java

License:Apache License

/**
 * Rebuilds the token queue for the current block: the text between synonyms is handed to
 * {@code tokenizePartialBlock}, and each synonym is either queued as-is or, when
 * {@code expand} is set, replaced by the words decoded from its output together with the
 * tokens produced by {@code processPrevSynonym} / {@code processAfterSynonym}.
 */
void tokenizeWholeBlock() {
    queue.clear();
    int cursor = 0;
    final int blockEnd = block.length();
    boolean producedAfter = false;
    final ByteArrayDataInput in = new ByteArrayDataInput();

    for (int idx = 0; idx < synonyms.size(); idx++) {
        final MyToken current = synonyms.get(idx);
        tokenizePartialBlock(cursor, current.startOffset, producedAfter);

        // enqueue prev-synonym, bounded by the end of the preceding synonym (if any)
        if (expand) {
            final int lowerLimit = (idx > 0) ? synonyms.get(idx - 1).endOffset : 0;
            processPrevSynonym(current.startOffset, lowerLimit);
        }

        // enqueue synonyms
        if (!expand) {
            queue.add(current);
        } else {
            in.reset(current.output.bytes, current.output.offset, current.output.length);
            // Low bit of the header is the keepOrig flag, which is not used here.
            final int outputCount = in.readVInt() >>> 1;
            for (int i = 0; i < outputCount; i++) {
                map.words.get(in.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();

                // The output equal to the matched word gets posInc 1 / seq 0;
                // every other output gets posInc 0 and a 1-based sequence number.
                final boolean sameAsInput = current.word.equals(word);
                final int posInc = sameAsInput ? 1 : 0;
                final int seq = sameAsInput ? 0 : i + 1;
                queue.add(new MyToken(word, current.startOffset, current.endOffset, posInc, seq));
            }
        }

        // enqueue after-synonym, bounded by the start of the following synonym (if any)
        if (expand) {
            final int upperLimit = (idx < synonyms.size() - 1)
                    ? synonyms.get(idx + 1).startOffset
                    : block.length();
            producedAfter = processAfterSynonym(current.endOffset, upperLimit);
        }

        cursor = current.endOffset;
    }
    tokenizePartialBlock(cursor, blockEnd, producedAfter);
}

From source file:org.apache.solr.handler.admin.LukeRequestHandler.java

License:Apache License

/**
 * Collects per-field details for a single document: schema type, flags, stored values,
 * boost, document frequency and, when the field stores term vectors, the term vector.
 *
 * @param doc the stored document whose fields are described
 * @param docId id used to fetch term vectors from {@code reader}
 * @param reader index reader used for document frequencies and term vectors
 * @param schema schema used to resolve each field's type
 * @return a map from field name to that field's details
 * @throws IOException if reading the document frequency fails
 */
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRef utf16 = new CharsRef();
    final SimpleOrderedMap<Object> result = new SimpleOrderedMap<Object>();

    for (Object raw : doc.getFields()) {
        final Field field = (Field) raw;
        final SimpleOrderedMap<Object> info = new SimpleOrderedMap<Object>();

        final SchemaField schemaField = schema.getFieldOrNull(field.name());
        final FieldType fieldType = (schemaField == null) ? null : schemaField.getType();
        final boolean typed = fieldType != null;

        info.add("type", typed ? fieldType.getTypeName() : null);
        info.add("schema", getFieldFlags(schemaField));
        info.add("flags", getFieldFlags(field));

        // Without a schema type, fall back to the raw stored string as the term text.
        final String termText = typed ? fieldType.storedToIndexed(field) : field.stringValue();
        final Term term = new Term(field.name(), termText);

        info.add("value", typed ? fieldType.toExternal(field) : null);

        // TODO: this really should be "stored"
        info.add("internal", field.stringValue()); // may be a binary number

        final BytesRef binary = field.binaryValue();
        if (binary != null) {
            info.add("binary", Base64.byteArrayToBase64(binary.bytes, binary.offset, binary.length));
        }
        info.add("boost", field.boost());
        // this can be 0 for non-indexed fields
        info.add("docFreq", term.text() == null ? 0 : reader.docFreq(term));

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                final Terms vector = reader.getTermVector(docId, field.name());
                if (vector != null) {
                    final SimpleOrderedMap<Integer> termFreqs = new SimpleOrderedMap<Integer>();
                    final TermsEnum termsEnum = vector.iterator(null);
                    for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        UnicodeUtil.UTF8toUTF16(text, utf16);
                        termFreqs.add(utf16.toString(), freq);
                    }
                    info.add("termVector", termFreqs);
                }
            } catch (Exception ex) {
                // Best effort: a failing term vector must not hide the rest of the field info.
                log.warn("error writing term vector", ex);
            }
        }

        result.add(field.name(), info);
    }
    return result;
}

From source file:org.apache.solr.handler.admin.LukeRequestHandler.java

License:Apache License

/**
 * Adds term statistics for one field to {@code fieldMap}: the number of distinct terms, the
 * top terms by document frequency and a histogram of document frequencies. Nothing is added
 * when the index has no indexed fields or the field has no terms.
 *
 * @param req request supplying the searcher and the {@code NUMTERMS} parameter
 * @param field name of the field to inspect
 * @param fieldMap receives the "distinct", "topTerms" and "histogram" entries
 * @throws IOException if reading the terms fails
 */
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap)
        throws IOException {

    final SolrParams params = req.getParams();
    final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);

    // Collects the top N terms; sized one larger so an overflowing entry can be popped.
    final TopTermQueue topTerms = new TopTermQueue(numTerms + 1);
    final CharsRef utf16 = new CharsRef();

    final Fields fields = MultiFields.getFields(req.getSearcher().getIndexReader());
    if (fields == null) {
        return; // No indexed fields
    }

    final Terms terms = fields.terms(field);
    if (terms == null) {
        return; // No terms in the field.
    }

    final TermsEnum termsEnum = terms.iterator(null);
    final int[] buckets = new int[HIST_ARRAY_SIZE];
    for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
        ++topTerms.distinctTerms;

        // This calculation seems odd, but it gives the same results as it used to.
        final int freq = termsEnum.docFreq();
        final int slot = 32 - Integer.numberOfLeadingZeros(Math.max(0, freq - 1));
        buckets[slot]++;

        if (numTerms <= 0 || freq <= topTerms.minFreq) {
            continue;
        }

        UnicodeUtil.UTF8toUTF16(text, utf16);
        final String termText = utf16.toString();
        topTerms.add(new TopTermQueue.TermInfo(new Term(field, termText), termsEnum.docFreq()));
        if (topTerms.size() > numTerms) {
            // Queue is full: drop the lowest entry and raise the admission threshold.
            topTerms.pop();
            topTerms.minFreq = topTerms.getTopTermInfo().docFreq;
        }
    }
    topTerms.histogram.add(buckets);
    fieldMap.add("distinct", topTerms.distinctTerms);

    // Include top terms
    fieldMap.add("topTerms", topTerms.toNamedList(req.getSearcher().getSchema()));

    // Add a histogram
    fieldMap.add("histogram", topTerms.histogram.toNamedList());
}

From source file:org.apache.solr.handler.component.HelloHandlerComponent.java

License:Apache License

/**
 * Re-derives the sort field values for the documents in the current result list and adds
 * them to the response under {@code "sort_values"}. Does nothing unless the
 * {@code ResponseBuilder.FIELD_SORT_VALUES} request parameter is true.
 *
 * <p>The query cache doesn't currently store sort field values, and SolrIndexSearcher doesn't
 * currently have an option to return sort field values, so the values are recomputed here
 * with a one-slot comparator per sort field.
 *
 * @param rb response builder holding the request, response, sort spec and results
 * @param searcher searcher used to weight the sort and to reach the index segments
 * @throws IOException if reading from the index fails
 */
protected void doFieldSortValues(ResponseBuilder rb, SolrIndexSearcher searcher) throws IOException {
    SolrQueryRequest req = rb.req;
    SolrQueryResponse rsp = rb.rsp;
    final CharsRef spare = new CharsRef();

    boolean fsv = req.getParams().getBool(ResponseBuilder.FIELD_SORT_VALUES, false);
    if (!fsv) {
        return;
    }

    Sort sort = searcher.weightSort(rb.getSortSpec().getSort());
    SortField[] sortFields = sort == null ? new SortField[] { SortField.FIELD_SCORE } : sort.getSort();
    NamedList<Object[]> sortVals = new NamedList<Object[]>(); // order is important for the sort fields
    Field field = new StringField("dummy", "", Field.Store.NO); // a dummy Field
    IndexReaderContext topReaderContext = searcher.getTopReaderContext();
    List<AtomicReaderContext> leaves = topReaderContext.leaves();
    AtomicReaderContext currentLeaf = null;
    if (leaves.size() == 1) {
        // if there is a single segment, use that subReader and avoid looking up each time
        currentLeaf = leaves.get(0);
        leaves = null;
    }

    DocList docList = rb.getResults().docList;

    // sort ids from lowest to highest so we can access them in order;
    // the low 32 bits remember each doc's position in the result list
    int nDocs = docList.size();
    long[] sortedIds = new long[nDocs];
    DocIterator it = docList.iterator();
    for (int i = 0; i < nDocs; i++) {
        sortedIds[i] = (((long) it.nextDoc()) << 32) | i;
    }
    Arrays.sort(sortedIds);

    for (SortField sortField : sortFields) {
        SortField.Type type = sortField.getType();
        if (type == SortField.Type.SCORE || type == SortField.Type.DOC) {
            continue;
        }

        FieldComparator<?> comparator = null;

        String fieldname = sortField.getField();
        // NOTE(review): ft is null when the sort field has no name or no schema type; a String
        // or BytesRef value would then fail in ft.toObject below - confirm this cannot happen.
        FieldType ft = fieldname == null ? null : req.getSchema().getFieldTypeNoEx(fieldname);

        Object[] vals = new Object[nDocs];

        int lastIdx = -1;
        int idx = 0;

        for (long idAndPos : sortedIds) {
            int doc = (int) (idAndPos >>> 32);
            int position = (int) idAndPos;

            if (leaves != null) {
                idx = ReaderUtil.subIndex(doc, leaves);
                currentLeaf = leaves.get(idx);
                if (idx != lastIdx) {
                    // we switched segments.  invalidate comparator.
                    // Remember the segment so the comparator is reused for the following
                    // documents of the same segment instead of being rebuilt per document.
                    lastIdx = idx;
                    comparator = null;
                }
            }

            if (comparator == null) {
                comparator = sortField.getComparator(1, 0);
                comparator = comparator.setNextReader(currentLeaf);
            }

            doc -= currentLeaf.docBase; // adjust for what segment this is in
            comparator.copy(0, doc);
            Object val = comparator.value(0);

            // Sortable float, double, int, long types all just use a string
            // comparator. For these, we need to put the type into a readable
            // format.  One reason for this is that XML can't represent all
            // string values (or even all unicode code points).
            // indexedToReadable() should be a no-op and should
            // thus be harmless anyway (for all current ways anyway)
            if (val instanceof String) {
                field.setStringValue((String) val);
                val = ft.toObject(field);
            }

            // Must do the same conversion when sorting by a
            // String field in Lucene, which returns the terms
            // data as BytesRef:
            if (val instanceof BytesRef) {
                UnicodeUtil.UTF8toUTF16((BytesRef) val, spare);
                field.setStringValue(spare.toString());
                val = ft.toObject(field);
            }

            vals[position] = val;
        }

        sortVals.add(fieldname, vals);
    }

    rsp.add("sort_values", sortVals);
}

From source file:org.apache.solr.handler.component.QueryComponent.java

License:Apache License

/**
 * Re-derives the sort field values for the documents in the current result list and adds
 * them to the response under {@code "sort_values"}. Does nothing unless the
 * {@code ResponseBuilder.FIELD_SORT_VALUES} request parameter is true.
 *
 * <p>The query cache doesn't currently store sort field values, and SolrIndexSearcher doesn't
 * currently have an option to return sort field values, so the values are recomputed here
 * with a one-slot comparator per sort field.
 *
 * @param rb response builder holding the request, response, sort spec and results
 * @param searcher searcher used to weight the sort, resolve field types and reach the segments
 * @throws IOException if reading from the index fails
 */
protected void doFieldSortValues(ResponseBuilder rb, SolrIndexSearcher searcher) throws IOException {
    SolrQueryRequest req = rb.req;
    SolrQueryResponse rsp = rb.rsp;
    final CharsRef spare = new CharsRef();

    boolean fsv = req.getParams().getBool(ResponseBuilder.FIELD_SORT_VALUES, false);
    if (!fsv) {
        return;
    }

    Sort sort = searcher.weightSort(rb.getSortSpec().getSort());
    SortField[] sortFields = sort == null ? new SortField[] { SortField.FIELD_SCORE } : sort.getSort();
    NamedList<Object[]> sortVals = new NamedList<Object[]>(); // order is important for the sort fields
    Field field = new StringField("dummy", "", Field.Store.NO); // a dummy Field
    IndexReaderContext topReaderContext = searcher.getTopReaderContext();
    List<AtomicReaderContext> leaves = topReaderContext.leaves();
    AtomicReaderContext currentLeaf = null;
    if (leaves.size() == 1) {
        // if there is a single segment, use that subReader and avoid looking up each time
        currentLeaf = leaves.get(0);
        leaves = null;
    }

    DocList docList = rb.getResults().docList;

    // sort ids from lowest to highest so we can access them in order;
    // the low 32 bits remember each doc's position in the result list
    int nDocs = docList.size();
    long[] sortedIds = new long[nDocs];
    DocIterator it = docList.iterator();
    for (int i = 0; i < nDocs; i++) {
        sortedIds[i] = (((long) it.nextDoc()) << 32) | i;
    }
    Arrays.sort(sortedIds);

    for (SortField sortField : sortFields) {
        SortField.Type type = sortField.getType();
        if (type == SortField.Type.SCORE || type == SortField.Type.DOC) {
            continue;
        }

        FieldComparator<?> comparator = null;

        String fieldname = sortField.getField();
        // NOTE(review): ft is null when the sort field has no name or no schema type; a String
        // or BytesRef value would then fail in ft.toObject below - confirm this cannot happen.
        FieldType ft = fieldname == null ? null : searcher.getSchema().getFieldTypeNoEx(fieldname);

        Object[] vals = new Object[nDocs];

        int lastIdx = -1;
        int idx = 0;

        for (long idAndPos : sortedIds) {
            int doc = (int) (idAndPos >>> 32);
            int position = (int) idAndPos;

            if (leaves != null) {
                idx = ReaderUtil.subIndex(doc, leaves);
                currentLeaf = leaves.get(idx);
                if (idx != lastIdx) {
                    // we switched segments.  invalidate comparator.
                    // Remember the segment so the comparator is reused for the following
                    // documents of the same segment instead of being rebuilt per document.
                    lastIdx = idx;
                    comparator = null;
                }
            }

            if (comparator == null) {
                comparator = sortField.getComparator(1, 0);
                comparator = comparator.setNextReader(currentLeaf);
            }

            doc -= currentLeaf.docBase; // adjust for what segment this is in
            comparator.copy(0, doc);
            Object val = comparator.value(0);

            // Sortable float, double, int, long types all just use a string
            // comparator. For these, we need to put the type into a readable
            // format.  One reason for this is that XML can't represent all
            // string values (or even all unicode code points).
            // indexedToReadable() should be a no-op and should
            // thus be harmless anyway (for all current ways anyway)
            if (val instanceof String) {
                field.setStringValue((String) val);
                val = ft.toObject(field);
            }

            // Must do the same conversion when sorting by a
            // String field in Lucene, which returns the terms
            // data as BytesRef:
            if (val instanceof BytesRef) {
                UnicodeUtil.UTF8toUTF16((BytesRef) val, spare);
                field.setStringValue(spare.toString());
                val = ft.toObject(field);
            }

            vals[position] = val;
        }

        sortVals.add(fieldname, vals);
    }

    rsp.add("sort_values", sortVals);
}

From source file:org.apache.solr.request.PerSegmentSingleValuedFaceting.java

License:Apache License

/**
 * Offers one term and its count to the bounded result queue.
 *
 * @param term the term as UTF-8 bytes
 * @param count the count collected for {@code term}
 * @return always {@code false}
 */
@Override
public boolean collect(BytesRef term, int count) {
    // NOTE: we use c>min rather than c>=min as an optimization because we are going in
    // index order, so we already know that the keys are ordered.  This can be very
    // important if a lot of the counts are repeated (like zero counts would be).
    if (count <= min) {
        return false;
    }

    UnicodeUtil.UTF8toUTF16(term, spare);
    final String label = spare.toString();
    queue.add(new SimpleFacets.CountPair<String, Integer>(label, count));

    // Once the queue has reached maxsize, raise the threshold to its last entry's value.
    if (queue.size() >= maxsize) {
        min = queue.last().val;
    }
    return false;
}