Example usage for org.apache.lucene.util BytesRef deepCopyOf

List of usage examples for org.apache.lucene.util BytesRef deepCopyOf

Introduction

On this page you can find example usages of org.apache.lucene.util.BytesRef.deepCopyOf.

Prototype

public static BytesRef deepCopyOf(BytesRef other) 

Source Link

Document

Creates a new BytesRef that points to a copy of the bytes from other.

The returned BytesRef will have a length of other.length and an offset of zero.

Usage

From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java

License:Open Source License

/**
 * Tokenizes a query string using Lucene's analyzer. This also removes
 * stopwords from the query string. Returns a mapping of query-term to
 * in-query-frequency. The {@link IndexDataProvider} instance is used to skip
 * terms not found in the collection.
 *
 * @param query Query String
 * @param qAnalyzer Analyzer used to parse the query String
 * @param dataProv IndexDataProvider; if {@code null}, no terms are filtered
 * @return mapping of query-term to in-query-frequency with optionally terms
 * not in the collection skipped
 */
@SuppressWarnings("ObjectAllocationInLoop")
public static Map<BytesRef, Integer> tokenizeAndMapQuery(@NotNull final String query,
        @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) {
    // estimate size
    final Map<BytesRef, Integer> result = new HashMap<>(
            (int) ((double) StringUtils.estimatedWordCount(query) * 1.8));
    try (TokenStream stream = qAnalyzer.tokenStream(null, query)) {
        // the attribute instance is the same for every token, so look it up once
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            // count the token; the deep copy is what gets stored as key for a new term
            result.merge(BytesRef.deepCopyOf(term), 1, Integer::sum);
        }
        // complete the TokenStream consumer workflow (reset -> incrementToken -> end -> close)
        stream.end();
    } catch (final IOException ignored) {
        // not thrown b/c we're using a string reader
    }
    if (dataProv != null) {
        removeUnknownTerms(dataProv, result.keySet()).stream().forEach(result::remove);
    }
    return result;
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.TermStats.java

License:Open Source License

/**
 * Creates a statistics entry for a term without a total term frequency.
 *
 * @param field name of the field the term belongs to
 * @param termtext term bytes; a deep copy is stored, not the passed instance
 * @param df document frequency of the term
 */
TermStats(String field, BytesRef termtext, int df) {
    this.field = field;
    this.docFreq = df;
    // store a private copy so the entry does not alias the caller's BytesRef
    this.termtext = BytesRef.deepCopyOf(termtext);
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.TermStats.java

License:Open Source License

/**
 * Creates a statistics entry for a term including its total term frequency.
 *
 * @param field name of the field the term belongs to
 * @param termtext term bytes; a deep copy is stored, not the passed instance
 * @param df document frequency of the term
 * @param tf total term frequency of the term
 */
TermStats(String field, BytesRef termtext, int df, long tf) {
    this.field = field;
    this.docFreq = df;
    this.totalTermFreq = tf;
    // store a private copy so the entry does not alias the caller's BytesRef
    this.termtext = BytesRef.deepCopyOf(termtext);
}

From source file:edu.utsa.sifter.Result.java

License:Apache License

/**
 * Fills {@code features} with the ranking factors of this result and collects
 * the frequencies of the query terms found in the document's "body" term vector.
 *
 * <p>For allocated results the date, filename and directory features are set;
 * the file-type priority and term-vector based features are always set.
 *
 * @param features feature vector to fill, indexed by the {@code HitRanker.F*} constants
 * @param refDate reference date the created/modified/accessed dates are compared to
 * @param rdr reader used to fetch the "body" term vector of this document
 * @param termSet query terms to look for in the path and in the term vector
 * @return term frequencies of the matched query terms and the maximum of them
 * @throws IOException if reading the term vector fails
 */
public DocTermInfo docRankFactors(final double[] features, final Date refDate, final IndexReader rdr,
        final Set<Term> termSet) throws IOException {
    final DocTermInfo ret = new DocTermInfo();
    final String lowerExt = Extension.toLowerCase();
    if (!isUnallocated()) {
        features[HitRanker.FCREATED] = dateDiff(Created, refDate);
        features[HitRanker.FMODIFIED] = dateDiff(Modified, refDate);
        features[HitRanker.FACCESSED] = dateDiff(Accessed, refDate);
        features[HitRanker.FAVG_RECENCY] = (features[HitRanker.FCREATED] + features[HitRanker.FMODIFIED]
                + features[HitRanker.FACCESSED]) / 3;
        features[HitRanker.FFILENAME_DIRECT] = 0;
        features[HitRanker.FFILENAME_INDIRECT] = 0;
        final String fullPath = Path + Name;
        for (Term t : termSet) {
            // contains() also reports a match at index 0, which "indexOf(..) > 0" missed
            if (fullPath.contains(t.text())) {
                features[HitRanker.FFILENAME_INDIRECT] = 1;
                break;
            }
        }
        features[HitRanker.FUSER_DIRECTORY] = 0;
        for (String dir : SystemDirs) {
            if (Path.contains(dir)) {
                features[HitRanker.FUSER_DIRECTORY] = 1;
                break;
            }
        }
    }
    features[HitRanker.FHIGH_PRIORITY_TYPE] = DocMaker.HighPriorityTypes.contains(lowerExt) ? 1 : 0;
    features[HitRanker.FMED_PRIORITY_TYPE] = DocMaker.MedPriorityTypes.contains(lowerExt) ? 1 : 0;
    // low priority is the fallback when neither high nor medium priority applies
    features[HitRanker.FLOW_PRIORITY_TYPE] = features[HitRanker.FHIGH_PRIORITY_TYPE]
            + features[HitRanker.FMED_PRIORITY_TYPE] > 0 ? 0 : 1;

    double dotSum = 0, docVecSumSqrs = 0, queryVecSumSqrs = 0;

    // getTermVector may return null (e.g. no term vector stored for this document/field);
    // in that case all term-vector sums simply stay at zero
    final Terms terms = rdr.getTermVector(LuceneID, "body");
    if (terms != null) {
        final TermsEnum term = terms.iterator(null);
        while (term.next() != null) {
            final long termCount = term.totalTermFreq();
            docVecSumSqrs += termCount * termCount;
            if (termSet.contains(new Term("body", term.term()))) {
                dotSum += termCount;
                ++queryVecSumSqrs;
                // the enum's BytesRef must not be retained as-is, so store a deep copy as key
                ret.TermFreqs.put(BytesRef.deepCopyOf(term.term()), termCount);
                ret.MaxTermFreq = Math.max(ret.MaxTermFreq, termCount);
            }
        }
    }

    // NOTE(review): the denominator is the SUM of the two norms, not their product as in
    // the textbook cosine similarity; kept unchanged because consumers of this feature
    // may depend on the current scale - confirm before changing.
    final double normSum = Math.sqrt(docVecSumSqrs) + Math.sqrt(queryVecSumSqrs);
    // guard against 0/0, which would put NaN into the feature vector
    features[HitRanker.FCOSINE_SIMILARITY] = normSum > 0 ? dotSum / normSum : 0;
    features[HitRanker.FTERM_CARDINALITY] = termSet.isEmpty() ? 0 : queryVecSumSqrs / termSet.size();

    // HitRanker.FTERM_LENGTH and HitRanker.FTERM_PRIORITY are not computed here
    return ret;
}

From source file:in.geocoder.component.GeocodingComponent.java

License:Apache License

/**
 * Lists all terms of {@code field} together with their document frequency.
 *
 * <p>The terms are first gathered in a {@code BoundedTreeSet} of {@code CountPair}s
 * and then emitted in that set's iteration order, converted to their readable form
 * by the field's {@code FieldType}.
 *
 * @param searcher searcher whose atomic reader provides the terms
 * @param schema schema used to look up the field type; a {@code StrField} is used
 *        when the schema has no type for the field
 * @param field name of the field to enumerate
 * @return readable term to document frequency; empty if the field has no terms
 * @throws IOException if reading the index fails
 */
private NamedList<Integer> getTerms(SolrIndexSearcher searcher, IndexSchema schema, String field)
        throws IOException {
    final AtomicReader indexReader = searcher.getAtomicReader();
    final Fields lfields = indexReader.fields();

    final Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
        // no terms for this field
        return new NamedList<Integer>();
    }

    FieldType ft = schema.getFieldTypeNoEx(field);
    if (ft == null) {
        ft = new StrField();
    }

    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
            new BoundedTreeSet<CountPair<BytesRef, Integer>>(Integer.MAX_VALUE);
    final TermsEnum termsEnum = terms.iterator(null);
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
        // the enum may reuse the returned BytesRef between calls, so queue a deep copy
        queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), termsEnum.docFreq()));
    }

    final NamedList<Integer> fieldTerms = new NamedList<Integer>();
    final CharsRef external = new CharsRef();
    for (CountPair<BytesRef, Integer> item : queue) {
        ft.indexedToReadable(item.key, external);
        fieldTerms.add(external.toString(), item.val);
    }

    return fieldTerms;
}

From source file:io.crate.executor.transport.TransportShardUpsertAction.java

License:Apache License

/**
 * Builds the index request that inserts a single upsert item.
 *
 * <p>The item's row is handed to all collect expressions, then the document source
 * is assembled from the reference/input pairs of {@code implContext}. If the
 * {@code DocSysColumns.RAW} column is encountered, its value is used as the source
 * and the remaining columns are not visited; otherwise a JSON object with one
 * field per column is used.
 *
 * @param request shard request providing index, type, routing and duplicate handling
 * @param item the item to insert
 * @param implContext context holding the collect expressions and reference inputs
 * @return the prepared index request
 * @throws IOException if building the JSON source fails
 */
private IndexRequest prepareInsert(ShardUpsertRequest request, ShardUpsertRequest.Item item,
        SymbolToInputContext implContext) throws IOException {
    // feed the current row to every collect expression
    for (CollectExpression<Row, ?> expression : implContext.collectExpressions()) {
        expression.setNextRow(item.row());
    }

    BytesRef raw = null;
    final XContentBuilder jsonSource = XContentFactory.jsonBuilder().startObject();
    for (Map.Entry<Reference, Input<?>> refAndInput : implContext.referenceInputMap.entrySet()) {
        final ColumnIdent column = refAndInput.getKey().ident().columnIdent();
        final Input<?> input = refAndInput.getValue();
        if (column.equals(DocSysColumns.RAW)) {
            // a raw source replaces the assembled JSON, so stop collecting fields
            raw = (BytesRef) input.value();
            break;
        }
        jsonSource.field(column.fqn(), input.value());
    }

    final IndexRequest insert = Requests.indexRequest(request.index())
            .type(request.type())
            .id(item.id())
            .routing(request.routing())
            .create(!request.overwriteDuplicates())
            .operationThreaded(false);
    if (raw == null) {
        insert.source(jsonSource.bytes());
    } else {
        // the deep copy's backing array is handed over as the document source
        insert.source(BytesRef.deepCopyOf(raw).bytes);
    }

    if (logger.isTraceEnabled()) {
        logger.trace("Inserting document with id {}, source: {}", item.id(), insert.source().toUtf8());
    }
    return insert;
}

From source file:io.crate.expression.reference.doc.lucene.BytesRefColumnReference.java

License:Apache License

/**
 * Positions this reference on {@code docId} and caches a copy of its value.
 *
 * <p>{@code value} becomes {@code null} when {@code values.advanceExact(docId)}
 * returns false, and a deep copy of the single value when exactly one is reported.
 *
 * @param docId the document to read the value for
 * @throws IOException if reading the doc values fails
 * @throws GroupByOnArrayUnsupportedException if the value count is not exactly one
 */
@Override
public void setNextDocId(int docId) throws IOException {
    super.setNextDocId(docId);
    if (!values.advanceExact(docId)) {
        value = null;
        return;
    }
    if (values.docValueCount() != 1) {
        throw new GroupByOnArrayUnsupportedException(columnName);
    }
    value = BytesRef.deepCopyOf(values.nextValue());
}

From source file:io.crate.expression.reference.doc.lucene.IdCollectorExpression.java

License:Apache License

/**
 * Positions this expression on {@code docId} and caches a copy of its value.
 *
 * <p>{@code value} is set to {@code null} when {@code values.advanceExact(docId)}
 * returns false; otherwise exactly one value is expected and deep-copied.
 *
 * @param docId the document to read the value for
 * @throws IOException if reading the doc values fails
 * @throws GroupByOnArrayUnsupportedException if the value count is not exactly one
 */
@Override
public void setNextDocId(int docId) throws IOException {
    super.setNextDocId(docId);
    if (values.advanceExact(docId)) {
        final int valueCount = values.docValueCount();
        if (valueCount != 1) {
            throw new GroupByOnArrayUnsupportedException(columnName);
        }
        value = BytesRef.deepCopyOf(values.nextValue());
    } else {
        value = null;
    }
}

From source file:io.crate.operation.reference.doc.lucene.BytesRefColumnReference.java

License:Apache License

/**
 * Returns the value of the column for the current position.
 *
 * @return {@code null} if {@code values} reports a cardinality of 0, otherwise a
 *         deep copy of the single value
 * @throws GroupByOnArrayUnsupportedException if the cardinality is neither 0 nor 1
 */
@Override
public BytesRef value() throws ValidationException {
    final int count = values.cardinality();
    if (count == 0) {
        return null;
    }
    if (count == 1) {
        // resolve the first (only) ordinal and hand out an independent copy
        return BytesRef.deepCopyOf(values.lookupOrd(values.ordAt(0)));
    }
    throw new GroupByOnArrayUnsupportedException(columnName());
}

From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.QueryParserBase.java

License:Apache License

/**
 * Analyzes a single multi-term query part (e.g. a wildcard or range bound) and
 * returns the bytes of the one term the analyzer produces for it.
 *
 * @param field field name passed to the analyzer
 * @param part the text to analyze
 * @param analyzerIn analyzer to use; if {@code null}, {@code getAnalyzer()} is used
 * @return an independent copy of the analyzed term's bytes
 * @throws IllegalArgumentException if the analyzer yields no term or more than one term
 * @throws RuntimeException if the analysis fails with an {@link IOException}
 */
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) {
    if (analyzerIn == null) {
        analyzerIn = getAnalyzer();
    }

    try (TokenStream source = analyzerIn.tokenStream(field, part)) {
        source.reset();

        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);

        if (!source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part);
        }
        // Read and copy the term only now: the attribute's bytes reflect the current token,
        // so they must be fetched after incrementToken() and copied before the stream is
        // advanced again below.
        BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
        if (source.incrementToken()) {
            throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part);
        }
        source.end();
        return bytes;
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing multiTerm term: " + part, e);
    }
}