List of usage examples for org.apache.lucene.util.BytesRef.deepCopyOf
public static BytesRef deepCopyOf(BytesRef other)
Creates a new BytesRef that points to a copy of the bytes from other. The returned BytesRef will have a length of other.length and an offset of zero.
Parameter: other — the BytesRef to copy.
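Why the deep copy matters: Lucene's TermsEnum (and the TermToBytesRefAttribute used during analysis) reuses a single BytesRef instance across iterations, so any term value you want to keep beyond the next call to next() or incrementToken() must be copied first. A minimal sketch, assuming a Lucene 5.x-style API; the field name "myfield" and the index path are hypothetical:

import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class DeepCopyExample {
    public static void main(String[] args) throws Exception {
        List<BytesRef> collected = new ArrayList<>();
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            Terms terms = MultiFields.getTerms(reader, "myfield");
            if (terms != null) {
                TermsEnum te = terms.iterator();
                BytesRef term;
                while ((term = te.next()) != null) {
                    // te.next() reuses the same BytesRef; without deepCopyOf every
                    // element of 'collected' would end up pointing at the last term.
                    collected.add(BytesRef.deepCopyOf(term));
                }
            }
        }
        System.out.println(collected.size() + " terms copied");
    }
}

The per-project examples below all follow this pattern: deep-copy at the moment a reused BytesRef needs to outlive its enumerator.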
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createNaiveKmerQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream,
        TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    while (stream.incrementToken()) {
        // The attribute reuses its BytesRef for every token, so deep-copy before wrapping it in a Term.
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        builder.add(new TermQuery(t), BooleanClause.Occur.SHOULD);
    }
}
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createChainProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream,
        TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    Term[] termArr = new Term[2];
    long[] offsetArr = new long[2];
    for (int i = 0; i < 2; i++) {
        termArr[i] = null;
        offsetArr[i] = 0;
    }

    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (termArr[0] == null) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else if (termArr[1] == null) {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        } else {
            // shift
            termArr[0] = termArr[1];
            offsetArr[0] = offsetArr[1];
            // fill
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();
        }

        if (termArr[0] != null && termArr[1] != null) {
            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();
                pq.setSlop((int) offsetDiff + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);
                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }
        }
    }
}
From source file:biospectra.classify.Classifier.java
License:Apache License
private void createPairedProximityQueryClauses(BooleanQuery.Builder builder, String field, CachingTokenFilter stream,
        TermToBytesRefAttribute termAtt, OffsetAttribute offsetAtt) throws IOException {
    Term[] termArr = new Term[2];
    long[] offsetArr = new long[2];
    for (int i = 0; i < 2; i++) {
        termArr[i] = null;
        offsetArr[i] = 0;
    }

    int count = 0;
    while (stream.incrementToken()) {
        Term t = new Term(field, BytesRef.deepCopyOf(termAtt.getBytesRef()));
        if (count % 2 == 0) {
            termArr[0] = t;
            offsetArr[0] = offsetAtt.startOffset();
        } else {
            termArr[1] = t;
            offsetArr[1] = offsetAtt.startOffset();

            long offsetDiff = offsetArr[1] - offsetArr[0];
            if (offsetDiff > 0) {
                PhraseQuery.Builder pq = new PhraseQuery.Builder();
                pq.setSlop((int) offsetDiff + 1);
                pq.add(termArr[0]);
                pq.add(termArr[1]);
                builder.add(pq.build(), BooleanClause.Occur.SHOULD);
            }

            termArr[0] = null;
            termArr[1] = null;
        }
        count++;
    }

    if (termArr[0] != null) {
        // Odd number of tokens: emit the unpaired trailing term on its own.
        builder.add(new TermQuery(termArr[0]), BooleanClause.Occur.SHOULD);
        termArr[0] = null;
    }
}
From source file:com.basho.yokozuna.handler.EntropyData.java
License:Open Source License
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp)
        throws Exception, InstantiationException, IllegalAccessException {
    String contParam = req.getParams().get("continue");
    BytesRef cont = contParam != null ? decodeCont(contParam) : DEFAULT_CONT;

    // TODO: Make 'before' required in handler config
    String before = req.getParams().get("before");
    if (before == null) {
        throw new Exception("Parameter 'before' is required");
    }

    int n = req.getParams().getInt("n", DEFAULT_N);

    // Add docs here and modify the object inline in code
    SolrDocumentList docs = new SolrDocumentList();
    rsp.add("response", docs);

    try {
        SolrIndexSearcher searcher = req.getSearcher();
        AtomicReader rdr = searcher.getAtomicReader();
        BytesRef tmp = null;
        Terms terms = rdr.terms(ENTROPY_DATA_FIELD);
        TermsEnum te = terms.iterator(null);

        if (isContinue(cont)) {
            log.debug("continue from " + cont);

            TermsEnum.SeekStatus status = te.seekCeil(cont, true);

            if (status == TermsEnum.SeekStatus.END) {
                rsp.add("more", false);
                return;
            } else if (status == TermsEnum.SeekStatus.FOUND) {
                // If this term has already been seen then skip it.
                tmp = te.next();
                if (endOfItr(tmp)) {
                    rsp.add("more", false);
                    return;
                }
            } else if (status == TermsEnum.SeekStatus.NOT_FOUND) {
                tmp = te.next();
            }
        } else {
            tmp = te.next();
        }

        String text = null;
        String[] vals = null;
        String ts = null;
        String docId = null;
        String vectorClock = null;
        int count = 0;
        BytesRef current = null;

        while (!endOfItr(tmp) && count < n) {
            // Deep-copy: 'tmp' is reused by the TermsEnum, but 'current' must
            // survive the loop to build the continuation token below.
            current = BytesRef.deepCopyOf(tmp);
            text = tmp.utf8ToString();
            log.debug("text: " + text);
            vals = text.split(" ");
            ts = vals[0];

            // TODO: what if null?
            if (!(ts.compareTo(before) < 0)) {
                rsp.add("more", false);
                docs.setNumFound(count);
                return;
            }

            docId = vals[1];
            vectorClock = vals[2];
            SolrDocument tmpDoc = new SolrDocument();
            tmpDoc.addField("doc_id", docId);
            tmpDoc.addField("base64_vclock", Base64.encodeBase64String(sha(vectorClock)));
            docs.add(tmpDoc);
            count++;
            tmp = te.next();
        }

        if (count < n) {
            rsp.add("more", false);
        } else {
            rsp.add("more", true);
            // deepCopyOf guarantees offset 0 and length == bytes.length,
            // so encoding current.bytes directly is safe here.
            String newCont = Base64.encodeBase64URLSafeString(current.bytes);
            // The continue context for the next request to start where this one finished.
            rsp.add("continuation", newCont);
        }

        docs.setNumFound(count);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java
License:Apache License
/**
 * Builds the frame of the cache. The cache stores word occurrences in
 * memory after they have been searched once. Used properly, this cache can
 * give a 2-100x speedup, but it can consume a lot of memory. To reduce
 * memory consumption, words with a very low occurrence count in the index
 * can be filtered out. The other parameter switches the term searching:
 * if true, only the terms in the skeleton are searched; if false, terms
 * not in the cache are searched as well (but not cached).
 *
 * @param minTermOccurrenceInCache Higher values reduce the cache size.
 * @param justCachedTerms The switch to fully exclude low-occurrence terms.
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
    this.justCachedTerms = justCachedTerms;
    this.docsWithClassSize = countDocsWithClass();
    termCClassHitCache.clear();
    cclasses.clear();
    classTermFreq.clear();

    // build the cache for the words
    Map<String, Long> frequencyMap = new HashMap<>();
    for (String textFieldName : textFieldNames) {
        TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termText = term.utf8ToString();
            long frequency = termsEnum.docFreq();
            Long lastfreq = frequencyMap.get(termText);
            if (lastfreq != null) {
                frequency += lastfreq;
            }
            frequencyMap.put(termText, frequency);
        }
    }
    for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
        if (entry.getValue() > minTermOccurrenceInCache) {
            termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
        }
    }

    // fill the class list; deep-copy each term because the enum reuses its BytesRef
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    while ((termsEnum.next()) != null) {
        cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
    }

    // fill the classTermFreq map
    for (BytesRef cclass : cclasses) {
        double avgNumberOfUniqueTerms = 0;
        for (String textFieldName : textFieldNames) {
            terms = MultiFields.getTerms(leafReader, textFieldName);
            long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
            avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
        }
        int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
        classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
    }
}
From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java
License:Apache License
private List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
    if (leafReader == null) {
        throw new IOException("You must first call Classifier#train");
    }

    List<ClassificationResult<BytesRef>> dataList = new ArrayList<>();
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    BytesRef next;
    String[] tokenizedDoc = tokenizeDoc(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    while ((next = termsEnum.next()) != null) {
        double clVal = calculateLogPrior(next, docsWithClassSize)
                + calculateLogLikelihood(tokenizedDoc, next, docsWithClassSize);
        // deep-copy the class term: 'next' is reused on the following iteration
        dataList.add(new ClassificationResult<>(BytesRef.deepCopyOf(next), clVal));
    }

    // normalization: transform the log-scores to a 0-1 range
    ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    if (!dataList.isEmpty()) {
        Collections.sort(dataList);
        // smax is the negative log-score closest to 0 (call it a)
        double smax = dataList.get(0).getScore();

        double sumLog = 0;
        // log(sum(exp(x_n - a)))
        for (ClassificationResult<BytesRef> cr : dataList) {
            // getScore() - smax <= 0 (both negative; smax has the smallest absolute value)
            sumLog += Math.exp(cr.getScore() - smax);
        }
        // loga = a + log(sum(exp(x_n - a))) = log(sum(exp(x_n)))
        double loga = smax;
        loga += Math.log(sumLog);

        // 1/sum * x = exp(log(x)) * 1/sum = exp(log(x) - log(sum))
        for (ClassificationResult<BytesRef> cr : dataList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(cr.getScore() - loga)));
        }
    }
    return returnList;
}
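The normalization in this method (and its twin below) is the standard log-sum-exp trick: subtracting the largest log-score before exponentiating keeps Math.exp from underflowing to zero when every score is a large negative log-probability. A self-contained sketch of the same computation, with a hypothetical scores array:

public class LogSumExpSketch {
    public static void main(String[] args) {
        double[] scores = { -1050.2, -1052.7, -1060.0 }; // hypothetical log-scores

        // a = the largest (closest to zero) log-score
        double max = Double.NEGATIVE_INFINITY;
        for (double s : scores) {
            max = Math.max(max, s);
        }

        // log(sum(exp(x_n))) = a + log(sum(exp(x_n - a))); each exponent is <= 0
        double sum = 0;
        for (double s : scores) {
            sum += Math.exp(s - max);
        }
        double logSum = max + Math.log(sum);

        // probabilities in (0, 1] that sum to 1; Math.exp(scores[i]) alone would underflow to 0.0
        for (double s : scores) {
            System.out.println(Math.exp(s - logSum));
        }
    }
}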
From source file:com.meizu.nlp.classification.SimpleNaiveBayesClassifier.java
License:Apache License
private List<ClassificationResult<BytesRef>> assignRankClassNormalizedList(String inputDocument) throws IOException {
    if (leafReader == null) {
        throw new IOException("You must first call Classifier#train");
    }
    if (this.rankClassFieldNames == null || this.rankClassFieldNames.length == 0) {
        throw new IOException("rankClassFieldNames must be defined");
    }

    List<ClassificationResult<BytesRef>> dataList = new ArrayList<>();
    Terms terms = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum termsEnum = terms.iterator();
    BytesRef next;
    String[] tokenizedDoc = tokenizeDoc(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    while ((next = termsEnum.next()) != null) {
        double clVal = calculateLogPrior(next, docsWithClassSize)
                + calculateLogLikelihood(tokenizedDoc, next, docsWithClassSize);
        dataList.add(new ClassificationResult<>(BytesRef.deepCopyOf(next), clVal));
    }

    // normalization: transform the log-scores to a 0-1 range (see assignClassNormalizedList)
    ArrayList<ClassificationResult<BytesRef>> returnList = new ArrayList<>();
    if (!dataList.isEmpty()) {
        Collections.sort(dataList);
        // smax is the negative log-score closest to 0 (call it a)
        double smax = dataList.get(0).getScore();

        double sumLog = 0;
        // log(sum(exp(x_n - a)))
        for (ClassificationResult<BytesRef> cr : dataList) {
            sumLog += Math.exp(cr.getScore() - smax);
        }
        // loga = a + log(sum(exp(x_n - a))) = log(sum(exp(x_n)))
        double loga = smax;
        loga += Math.log(sumLog);

        // 1/sum * x = exp(log(x)) * 1/sum = exp(log(x) - log(sum))
        for (ClassificationResult<BytesRef> cr : dataList) {
            returnList.add(new ClassificationResult<>(cr.getAssignedClass(), Math.exp(cr.getScore() - loga)));
        }
    }
    return returnList;
}
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
private Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> getOptions(Term text,
        PrefixTermsEnum prefixTermsEnum, IndexReader indexReader, final int size) throws IOException {
    OptionQueue collectionQueue = new OptionQueue(size);
    BytesRef ref;
    while ((ref = prefixTermsEnum.next()) != null) {
        Term term = new Term(text.field(), BytesRef.deepCopyOf(ref));
        collectionQueue.insertWithOverflow(new Suggest.Suggestion.Entry.Option(
                new StringText(term.bytes().utf8ToString()), indexReader.docFreq(term)));
    }
    Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option> entry =
            new Suggest.Suggestion.Entry<Suggest.Suggestion.Entry.Option>(
                    new StringText(text.text()), 0, text.bytes().length);
    while (collectionQueue.size() > 0) {
        entry.addOption(collectionQueue.pop());
    }
    return entry;
}
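OptionQueue itself is not shown here, but its insertWithOverflow/pop calls match the bounded-queue pattern of org.apache.lucene.util.PriorityQueue, which keeps only the top-N entries. A minimal sketch of that pattern, with a hypothetical Candidate class standing in for the suggestion options:

import org.apache.lucene.util.PriorityQueue;

// Hypothetical element type standing in for the suggestion options above.
final class Candidate {
    final String term;
    final int docFreq;

    Candidate(String term, int docFreq) {
        this.term = term;
        this.docFreq = docFreq;
    }
}

// A bounded top-N queue: once full, insertWithOverflow evicts the entry that
// lessThan ranks worst (here, the lowest document frequency).
final class TopCandidates extends PriorityQueue<Candidate> {
    TopCandidates(int size) {
        super(size);
    }

    @Override
    protected boolean lessThan(Candidate a, Candidate b) {
        return a.docFreq < b.docFreq;
    }
}

Popping until the queue is empty yields the survivors in worst-to-best order, which is why getOptions above drains collectionQueue with pop() in a loop.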
From source file:com.meltwater.elasticsearch.search.suggest.PrefixSuggester.java
License:Apache License
private List<Term> queryTerms(SuggestionSearchContext.SuggestionContext suggestion, CharsRefBuilder spare)
        throws IOException {
    final String field = suggestion.getField();
    final List<Term> ret = new ArrayList<Term>();
    SuggestUtils.analyze(suggestion.getAnalyzer(), suggestion.getText(), field,
            new SuggestUtils.TokenConsumer() {
                @Override
                public void nextToken() {
                    ret.add(new Term(field, BytesRef.deepCopyOf(fillBytesRef(new BytesRefBuilder()))));
                }
            }, spare);
    return ret;
}
From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java
License:Apache License
private boolean setSavedStartTerm(BytesRef startTerm) {
    // Deep-copy so the saved term is not clobbered if the caller reuses its BytesRef.
    // Returning true allows this to be invoked inside an assert statement.
    savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
    return true;
}