List of usage examples for org.apache.lucene.util BytesRef deepCopyOf
public static BytesRef deepCopyOf(BytesRef other)
Parameter: other - the BytesRef whose bytes are copied.
The returned BytesRef will have a length of other.length and an offset of zero.
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Tokenizes a query string using Lucenes analyzer. This also removes * stopwords from the query string. Returns a mapping of query-term to * in-query-frequency. The {@link IndexDataProvider} instance is used to skip * terms no found in the collection.//from ww w .ja v a2 s . co m * * @param query Query String * @param qAnalyzer Analyzer used to parse the query String * @param dataProv IndexDataProvider * @return mapping of query-term to in-query-frequency with optionally terms * not in the collection skipped */ @SuppressWarnings("ObjectAllocationInLoop") public static Map<BytesRef, Integer> tokenizeAndMapQuery(@NotNull final String query, @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) { // estimate size final Map<BytesRef, Integer> result = new HashMap<>( (int) ((double) StringUtils.estimatedWordCount(query) * 1.8)); try (TokenStream stream = qAnalyzer.tokenStream(null, query)) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (result.containsKey(term)) { result.put(BytesRef.deepCopyOf(term), result.get(term) + 1); } else { result.put(BytesRef.deepCopyOf(term), 1); } } } catch (final IOException e) { // not thrown b/c we're using a string reader } if (dataProv != null) { removeUnknownTerms(dataProv, result.keySet()).stream().forEach(result::remove); } return result; }
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.TermStats.java
License:Open Source License
/**
 * Creates term statistics that carry a document frequency.
 *
 * @param field name of the field the term belongs to
 * @param termtext term bytes; a deep copy is stored, not the passed instance
 * @param df document frequency of the term
 */
TermStats(String field, BytesRef termtext, int df) {
  this.field = field;
  this.docFreq = df;
  this.termtext = BytesRef.deepCopyOf(termtext);
}
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.TermStats.java
License:Open Source License
/**
 * Creates term statistics that carry both a document frequency and a total
 * term frequency.
 *
 * @param field name of the field the term belongs to
 * @param termtext term bytes; a deep copy is stored, not the passed instance
 * @param df document frequency of the term
 * @param tf total term frequency of the term
 */
TermStats(String field, BytesRef termtext, int df, long tf) {
  this.field = field;
  this.docFreq = df;
  this.totalTermFreq = tf;
  this.termtext = BytesRef.deepCopyOf(termtext);
}
From source file:edu.utsa.sifter.Result.java
License:Apache License
public DocTermInfo docRankFactors(final double[] features, final Date refDate, final IndexReader rdr, final Set<Term> termSet) throws IOException { // J.S./* ww w.j a va 2 s . c o m*/ final double[] featuresA = new double[19]; final DocTermInfo ret = new DocTermInfo(); final String lowerExt = Extension.toLowerCase(); if (!isUnallocated()) { features[HitRanker.FCREATED] = dateDiff(Created, refDate); features[HitRanker.FMODIFIED] = dateDiff(Modified, refDate); features[HitRanker.FACCESSED] = dateDiff(Accessed, refDate); features[HitRanker.FAVG_RECENCY] = (features[HitRanker.FCREATED] + features[HitRanker.FMODIFIED] + features[HitRanker.FACCESSED]) / 3; features[HitRanker.FFILENAME_DIRECT] = 0; features[HitRanker.FFILENAME_INDIRECT] = 0; final String fullPath = Path + Name; for (Term t : termSet) { if (fullPath.indexOf(t.text()) > 0) { features[HitRanker.FFILENAME_INDIRECT] = 1; break; } } features[HitRanker.FUSER_DIRECTORY] = 0; for (String dir : SystemDirs) { if (Path.indexOf(dir) > -1) { features[HitRanker.FUSER_DIRECTORY] = 1; break; } } } features[HitRanker.FHIGH_PRIORITY_TYPE] = DocMaker.HighPriorityTypes.contains(lowerExt) ? 1 : 0; features[HitRanker.FMED_PRIORITY_TYPE] = DocMaker.MedPriorityTypes.contains(lowerExt) ? 1 : 0; features[HitRanker.FLOW_PRIORITY_TYPE] = features[HitRanker.FHIGH_PRIORITY_TYPE] + features[HitRanker.FMED_PRIORITY_TYPE] > 0 ? 
0 : 1; final Terms terms = rdr.getTermVector(LuceneID, "body"); final TermsEnum term = terms.iterator(null); double dotSum = 0, docVecSumSqrs = 0, numDims = 0, queryVecSumSqrs = 0; long termCount = 0; while (term.next() != null) { ++numDims; termCount = term.totalTermFreq(); docVecSumSqrs += termCount * termCount; if (termSet.contains(new Term("body", term.term()))) { dotSum += termCount; ++queryVecSumSqrs; ret.TermFreqs.put(BytesRef.deepCopyOf(term.term()), termCount); ret.MaxTermFreq = Math.max(ret.MaxTermFreq, termCount); // System.err.println(Path + Name + " contains term " + term.term().utf8ToString() + ", with freq " + termCount); } } features[HitRanker.FCOSINE_SIMILARITY] = dotSum / (Math.sqrt(docVecSumSqrs) + Math.sqrt(queryVecSumSqrs)); features[HitRanker.FTERM_CARDINALITY] = queryVecSumSqrs / termSet.size(); // features[HitRanker.FTERM_LENGTH] // features[HitRanker.FTERM_PRIORITY] = 0.0; return ret; }
From source file:in.geocoder.component.GeocodingComponent.java
License:Apache License
private NamedList<Integer> getTerms(SolrIndexSearcher searcher, IndexSchema schema, String field) throws IOException { NamedList<Object> termsResult = new SimpleOrderedMap<Object>(); boolean sort = true; boolean raw = false; final AtomicReader indexReader = searcher.getAtomicReader(); Fields lfields = indexReader.fields(); NamedList<Integer> fieldTerms = new NamedList<Integer>(); termsResult.add(field, fieldTerms);/*from w ww .j a v a 2 s. co m*/ Terms terms = lfields == null ? null : lfields.terms(field); if (terms == null) { // no terms for this field return new NamedList<Integer>(); } FieldType ft = raw ? null : schema.getFieldTypeNoEx(field); if (ft == null) ft = new StrField(); TermsEnum termsEnum = terms.iterator(null); BytesRef term = null; term = termsEnum.next(); BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(Integer.MAX_VALUE) : null); CharsRef external = new CharsRef(); while (term != null) { boolean externalized = false; // did we fill in "external" yet for this term? // This is a good term in the range. Check if mincount/maxcount conditions are satisfied. int docFreq = termsEnum.docFreq(); // add the term to the list if (sort) { queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), docFreq)); } else { // TODO: handle raw somehow if (!externalized) { ft.indexedToReadable(term, external); } fieldTerms.add(external.toString(), docFreq); } term = termsEnum.next(); } if (sort) { for (CountPair<BytesRef, Integer> item : queue) { ft.indexedToReadable(item.key, external); fieldTerms.add(external.toString(), item.val); } } return fieldTerms; }
From source file:io.crate.executor.transport.TransportShardUpsertAction.java
License:Apache License
private IndexRequest prepareInsert(ShardUpsertRequest request, ShardUpsertRequest.Item item, SymbolToInputContext implContext) throws IOException { // collect inputs Set<CollectExpression<Row, ?>> collectExpressions = implContext.collectExpressions(); for (CollectExpression<Row, ?> collectExpression : collectExpressions) { collectExpression.setNextRow(item.row()); }//from w w w. j a v a 2s . co m BytesRef rawSource = null; XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); for (Map.Entry<Reference, Input<?>> entry : implContext.referenceInputMap.entrySet()) { ColumnIdent columnIdent = entry.getKey().ident().columnIdent(); if (columnIdent.equals(DocSysColumns.RAW)) { rawSource = (BytesRef) entry.getValue().value(); break; } builder.field(columnIdent.fqn(), entry.getValue().value()); } IndexRequest indexRequest = Requests.indexRequest(request.index()).type(request.type()).id(item.id()) .routing(request.routing()).create(!request.overwriteDuplicates()).operationThreaded(false); if (rawSource != null) { indexRequest.source(BytesRef.deepCopyOf(rawSource).bytes); } else { indexRequest.source(builder.bytes()); } if (logger.isTraceEnabled()) { logger.trace("Inserting document with id {}, source: {}", item.id(), indexRequest.source().toUtf8()); } return indexRequest; }
From source file:io.crate.expression.reference.doc.lucene.BytesRefColumnReference.java
License:Apache License
@Override public void setNextDocId(int docId) throws IOException { super.setNextDocId(docId); if (values.advanceExact(docId)) { if (values.docValueCount() == 1) { value = BytesRef.deepCopyOf(values.nextValue()); } else {// ww w . j a v a2 s .c om throw new GroupByOnArrayUnsupportedException(columnName); } } else { value = null; } }
From source file:io.crate.expression.reference.doc.lucene.IdCollectorExpression.java
License:Apache License
/**
 * Positions the doc values on {@code docId} and stores a deep copy of the
 * document's value, or {@code null} if the document has none.
 *
 * @param docId id of the document to position on
 * @throws IOException if advancing the doc values fails
 * @throws GroupByOnArrayUnsupportedException if the value count of the document is not exactly one
 */
@Override
public void setNextDocId(int docId) throws IOException {
  super.setNextDocId(docId);
  final boolean hasValue = values.advanceExact(docId);
  if (hasValue) {
    final int valueCount = values.docValueCount();
    if (valueCount == 1) {
      value = BytesRef.deepCopyOf(values.nextValue());
    } else {
      throw new GroupByOnArrayUnsupportedException(columnName);
    }
  } else {
    value = null;
  }
}
From source file:io.crate.operation.reference.doc.lucene.BytesRefColumnReference.java
License:Apache License
@Override public BytesRef value() throws ValidationException { switch (values.cardinality()) { case 0:// www. j a v a 2s .com return null; case 1: return BytesRef.deepCopyOf(values.lookupOrd(values.ordAt(0))); default: throw new GroupByOnArrayUnsupportedException(columnName()); } }
From source file:jp.scaleout.elasticsearch.plugins.queryparser.classic.QueryParserBase.java
License:Apache License
protected BytesRef analyzeMultitermTerm(String field, String part, Analyzer analyzerIn) { if (analyzerIn == null) analyzerIn = getAnalyzer();//from www. j a va 2 s .c o m try (TokenStream source = analyzerIn.tokenStream(field, part)) { source.reset(); TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class); BytesRef bytes = termAtt.getBytesRef(); if (!source.incrementToken()) throw new IllegalArgumentException("analyzer returned no terms for multiTerm term: " + part); //termAtt.fillBytesRef(); if (source.incrementToken()) throw new IllegalArgumentException("analyzer returned too many terms for multiTerm term: " + part); source.end(); return BytesRef.deepCopyOf(bytes); } catch (IOException e) { throw new RuntimeException("Error analyzing multiTerm term: " + part, e); } }