Example usage for org.apache.lucene.util BytesRef deepCopyOf

Introduction

On this page you can find example usages of org.apache.lucene.util.BytesRef.deepCopyOf.

Prototype

public static BytesRef deepCopyOf(BytesRef other)

Source Link

Document

Creates a new BytesRef that points to a copy of the bytes from `other`.

The returned BytesRef will have a length of other.length and an offset of zero.

Usage

From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java

License:Open Source License

/**
 * Tokenizes a query string using Lucene's analyzer. This also removes
 * stopwords from the query string. Returns a mapping of query-term to
 * in-query-frequency. The {@link IndexDataProvider} instance is used to skip
 * terms not found in the collection.
 *
 * @param query Query String
 * @param qAnalyzer Analyzer used to parse the query String
 * @param dataProv IndexDataProvider; if {@code null}, no terms are filtered out
 * @return mapping of query-term to in-query-frequency with optionally terms
 * not in the collection skipped
 */
@SuppressWarnings("ObjectAllocationInLoop")
public static Map<BytesRef, Integer> tokenizeAndMapQuery(@NotNull final String query,
        @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) {
    // estimate size
    final Map<BytesRef, Integer> result = new HashMap<>(
            (int) ((double) StringUtils.estimatedWordCount(query) * 1.8));
    try (TokenStream stream = qAnalyzer.tokenStream(null, query)) {
        // The attribute instance is the same for the whole stream, so look it up once.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // new BytesRef(CharSequence) encodes into a freshly allocated array, so the
            // instance can be stored as a map key without an additional deep copy.
            final BytesRef term = new BytesRef(termAttr);
            // Single lookup: inserts 1 for a new term, otherwise increments the count.
            result.merge(term, 1, Integer::sum);
        }
    } catch (final IOException ignored) {
        // not thrown b/c we're using a string reader
    }
    if (dataProv != null) {
        removeUnknownTerms(dataProv, result.keySet()).stream().forEach(result::remove);
    }
    return result;
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.TermStats.java

License:Open Source License

/**
 * Creates statistics for a single term of a field.
 *
 * @param field    name of the field the term belongs to
 * @param termtext term bytes; a deep copy is stored, the passed instance is not retained
 * @param df       document frequency of the term
 */
TermStats(String field, BytesRef termtext, int df) {
    this.field = field;
    this.docFreq = df;
    // Keep a private copy of the term bytes rather than the caller's instance.
    this.termtext = BytesRef.deepCopyOf(termtext);
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.TermStats.java

License:Open Source License

/**
 * Creates statistics for a single term of a field, including its total term frequency.
 *
 * @param field    name of the field the term belongs to
 * @param termtext term bytes; a deep copy is stored, the passed instance is not retained
 * @param df       document frequency of the term
 * @param tf       total term frequency of the term
 */
TermStats(String field, BytesRef termtext, int df, long tf) {
    this.field = field;
    this.docFreq = df;
    this.totalTermFreq = tf;
    // Keep a private copy of the term bytes rather than the caller's instance.
    this.termtext = BytesRef.deepCopyOf(termtext);
}

From source file:edu.utsa.sifter.Result.java

License:Apache License

/**
 * Fills {@code features} with the ranking factors of this result and collects the
 * frequencies of the query terms that occur in the document's "body" term vector.
 *
 * @param features output array, written at the {@code HitRanker.F*} indices
 * @param refDate  reference date passed to {@code dateDiff} for the timestamp features
 * @param rdr      reader used to fetch the "body" term vector of this document
 * @param termSet  query terms to look for in the path and in the term vector
 * @return per-term frequencies of the matched query terms and the maximum among them
 * @throws IOException if reading the term vector fails
 */
public DocTermInfo docRankFactors(final double[] features, final Date refDate, final IndexReader rdr,
        final Set<Term> termSet) throws IOException {
    final DocTermInfo ret = new DocTermInfo();
    final String lowerExt = Extension.toLowerCase();
    if (!isUnallocated()) {
        features[HitRanker.FCREATED] = dateDiff(Created, refDate);
        features[HitRanker.FMODIFIED] = dateDiff(Modified, refDate);
        features[HitRanker.FACCESSED] = dateDiff(Accessed, refDate);
        features[HitRanker.FAVG_RECENCY] = (features[HitRanker.FCREATED] + features[HitRanker.FMODIFIED]
                + features[HitRanker.FACCESSED]) / 3;
        // NOTE(review): FFILENAME_DIRECT is only ever reset here, never set to 1 — confirm intent.
        features[HitRanker.FFILENAME_DIRECT] = 0;
        features[HitRanker.FFILENAME_INDIRECT] = 0;
        final String fullPath = Path + Name;
        for (Term t : termSet) {
            // contains() also reports a match at index 0, which "indexOf(...) > 0" missed.
            if (fullPath.contains(t.text())) {
                features[HitRanker.FFILENAME_INDIRECT] = 1;
                break;
            }
        }
        features[HitRanker.FUSER_DIRECTORY] = 0;
        for (String dir : SystemDirs) {
            if (Path.indexOf(dir) > -1) {
                features[HitRanker.FUSER_DIRECTORY] = 1;
                break;
            }
        }
    }
    features[HitRanker.FHIGH_PRIORITY_TYPE] = DocMaker.HighPriorityTypes.contains(lowerExt) ? 1 : 0;
    features[HitRanker.FMED_PRIORITY_TYPE] = DocMaker.MedPriorityTypes.contains(lowerExt) ? 1 : 0;
    // Low priority is the complement: neither high nor medium.
    features[HitRanker.FLOW_PRIORITY_TYPE] = features[HitRanker.FHIGH_PRIORITY_TYPE]
            + features[HitRanker.FMED_PRIORITY_TYPE] > 0 ? 0 : 1;

    double dotSum = 0, docVecSumSqrs = 0, queryVecSumSqrs = 0;

    // getTermVector returns null when the document has no term vector for the field.
    final Terms terms = rdr.getTermVector(LuceneID, "body");
    if (terms != null) {
        final TermsEnum term = terms.iterator(null);
        while (term.next() != null) {
            final long termCount = term.totalTermFreq();
            docVecSumSqrs += termCount * termCount;
            if (termSet.contains(new Term("body", term.term()))) {
                dotSum += termCount;
                ++queryVecSumSqrs;
                // The enum reuses its BytesRef between calls to next(), so store a copy as the key.
                ret.TermFreqs.put(BytesRef.deepCopyOf(term.term()), termCount);
                ret.MaxTermFreq = Math.max(ret.MaxTermFreq, termCount);
            }
        }
    }

    // NOTE(review): textbook cosine similarity divides by the PRODUCT of the norms; the sum is
    // kept here so existing feature values do not change — confirm which one is intended.
    final double normSum = Math.sqrt(docVecSumSqrs) + Math.sqrt(queryVecSumSqrs);
    // Guard the 0/0 cases (no term vector / empty term set) so no NaN leaks into the features.
    features[HitRanker.FCOSINE_SIMILARITY] = normSum > 0 ? dotSum / normSum : 0;
    features[HitRanker.FTERM_CARDINALITY] = termSet.isEmpty() ? 0 : queryVecSumSqrs / termSet.size();

    // features[HitRanker.FTERM_LENGTH] and features[HitRanker.FTERM_PRIORITY] are not computed here.
    return ret;
}

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

/**
 * Lists every term of {@code field} together with its document frequency.
 *
 * <p>Terms are first collected into a {@code BoundedTreeSet} of {@code CountPair}s and then
 * emitted in that set's iteration order (presumably by descending count — confirm against
 * {@code CountPair}'s ordering). Each term is converted to its readable form through the
 * field's {@code FieldType}; a {@code StrField} is used when the schema has no type for it.
 *
 * @param searcher searcher whose atomic reader supplies the terms
 * @param schema   schema used to resolve the field type
 * @param field    name of the field to enumerate
 * @return readable term to document frequency; empty if the field has no terms
 * @throws IOException if reading the index fails
 */
private NamedList<Integer> getTerms(SolrIndexSearcher searcher, IndexSchema schema, String field)
        throws IOException {
    final AtomicReader indexReader = searcher.getAtomicReader();
    Fields lfields = indexReader.fields();

    NamedList<Integer> fieldTerms = new NamedList<Integer>();

    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
        // no terms for this field
        return fieldTerms;
    }

    FieldType ft = schema.getFieldTypeNoEx(field);
    if (ft == null) {
        ft = new StrField();
    }

    BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
            new BoundedTreeSet<CountPair<BytesRef, Integer>>(Integer.MAX_VALUE);

    TermsEnum termsEnum = terms.iterator(null);
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        // The enum may reuse the returned BytesRef, so queue a private copy.
        queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), termsEnum.docFreq()));
    }

    // Scratch buffer reused for every indexed-to-readable conversion.
    CharsRef external = new CharsRef();
    for (CountPair<BytesRef, Integer> item : queue) {
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
    }

    return fieldTerms;
}

From source file:io.crate.executor.transport.TransportShardUpsertAction.java

License:Apache License

/**
 * Builds the {@code IndexRequest} for a single upsert item.
 *
 * <p>If one of the referenced columns is {@code DocSysColumns.RAW}, its value is used as the
 * complete document source and the remaining columns are skipped; otherwise the source is a
 * JSON object built from the column values.
 *
 * @param request     shard request supplying index, type, routing and the overwrite flag
 * @param item        item supplying the id and the row to read inputs from
 * @param implContext context holding the collect expressions and the reference-to-input map
 * @return the prepared index request
 * @throws IOException if building the JSON source fails
 */
private IndexRequest prepareInsert(ShardUpsertRequest request, ShardUpsertRequest.Item item,
        SymbolToInputContext implContext) throws IOException {
    // Point every collect expression at the current row so the inputs below resolve against it.
    Set<CollectExpression<Row, ?>> expressions = implContext.collectExpressions();
    for (CollectExpression<Row, ?> expression : expressions) {
        expression.setNextRow(item.row());
    }

    BytesRef raw = null;
    XContentBuilder json = XContentFactory.jsonBuilder().startObject();
    for (Map.Entry<Reference, Input<?>> column : implContext.referenceInputMap.entrySet()) {
        ColumnIdent ident = column.getKey().ident().columnIdent();
        Object columnValue = column.getValue().value();
        if (ident.equals(DocSysColumns.RAW)) {
            // A raw source replaces the whole document; the remaining columns are irrelevant.
            raw = (BytesRef) columnValue;
            break;
        }
        json.field(ident.fqn(), columnValue);
    }

    IndexRequest indexRequest = Requests.indexRequest(request.index())
            .type(request.type())
            .id(item.id())
            .routing(request.routing())
            .create(!request.overwriteDuplicates())
            .operationThreaded(false);
    if (raw == null) {
        indexRequest.source(json.bytes());
    } else {
        // deepCopyOf yields a zero-offset copy, so its backing array is passed as the source.
        indexRequest.source(BytesRef.deepCopyOf(raw).bytes);
    }

    if (logger.isTraceEnabled()) {
        logger.trace("Inserting document with id {}, source: {}", item.id(), indexRequest.source().toUtf8());
    }
    return indexRequest;
}

From source file:io.crate.expression.reference.doc.lucene.BytesRefColumnReference.java

License:Apache License

/**
 * Positions this reference on {@code docId} and caches the document's value.
 *
 * <p>{@code value} becomes {@code null} when the document has no value, and a private copy
 * of the value when it has exactly one.
 *
 * @param docId id of the document to read
 * @throws IOException if reading the doc values fails
 * @throws GroupByOnArrayUnsupportedException if the document has more than one value
 */
@Override
public void setNextDocId(int docId) throws IOException {
    super.setNextDocId(docId);
    if (!values.advanceExact(docId)) {
        value = null;
        return;
    }
    if (values.docValueCount() != 1) {
        throw new GroupByOnArrayUnsupportedException(columnName);
    }
    // Copy the bytes — presumably the instance returned by nextValue() is reused; confirm.
    value = BytesRef.deepCopyOf(values.nextValue());
}

From source file:io.crate.expression.reference.doc.lucene.IdCollectorExpression.java

License:Apache License

/**
 * Positions this expression on {@code docId} and caches the document's value.
 *
 * <p>{@code value} becomes {@code null} when the document has no value, and a private copy
 * of the value when it has exactly one.
 *
 * @param docId id of the document to read
 * @throws IOException if reading the doc values fails
 * @throws GroupByOnArrayUnsupportedException if the value count is anything other than one
 */
@Override
public void setNextDocId(int docId) throws IOException {
    super.setNextDocId(docId);
    if (values.advanceExact(docId)) {
        if (values.docValueCount() == 1) {
            // Copy the bytes — presumably the instance returned by nextValue() is reused; confirm.
            value = BytesRef.deepCopyOf(values.nextValue());
        } else {
            throw new GroupByOnArrayUnsupportedException(columnName);
        }
    } else {
        value = null;
    }
}

From source file:io.crate.operation.reference.doc.lucene.BytesRefColumnReference.java

License:Apache License

/**
 * Returns the value of the current document.
 *
 * @return {@code null} if the document has no value, otherwise a copy of its single value
 * @throws GroupByOnArrayUnsupportedException if the document has more than one value
 */
@Override
public BytesRef value() throws ValidationException {
    final int cardinality = values.cardinality();
    if (cardinality == 0) {
        return null;
    }
    if (cardinality == 1) {
        // Hand out a copy of the looked-up bytes rather than the instance owned by the values.
        return BytesRef.deepCopyOf(values.lookupOrd(values.ordAt(0)));
    }
    throw new GroupByOnArrayUnsupportedException(columnName());
}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.QueryParserBase.java

License:Apache License

/**
 * Analyzes a multi-term query part (e.g. of a wildcard or range query) into a single term.
 *
 * @param field      field the term belongs to; passed to the analyzer
 * @param part       text to analyze
 * @param analyzerIn analyzer to use; if {@code null}, {@code getAnalyzer()} is used
 * @return a copy of the bytes of the single term produced by the analyzer
 * @throws IllegalArgumentException if the analyzer produces no term or more than one term
 * @throws RuntimeException if the analyzer throws an {@code IOException}
 */
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null) {
        analyzerIn = getAnalyzer();
    }

    try (TokenStream source = analyzerIn.tokenStream(field, part)) {
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);

        if (!source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        // Read the term only after incrementToken() has produced it, and copy it right away:
        // the attribute's BytesRef reflects the current token and may be overwritten by the
        // next incrementToken() call. Reading it before the first token captured stale bytes.
        BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
        source.end();
        return bytes;
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    }
}