List of usage examples for org.apache.lucene.util.BytesRef.utf8ToString()
public String utf8ToString()
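Before the per-project examples, here is a minimal self-contained sketch of the call itself: BytesRef wraps UTF-8 bytes, and utf8ToString() decodes them back into a java.lang.String. The class name and string literal below are illustrative only and are not taken from any of the source files that follow.

import org.apache.lucene.util.BytesRef;

public class Utf8ToStringExample {
    public static void main(String[] args) {
        // Wrap a String as UTF-8 bytes, then decode it back.
        BytesRef ref = new BytesRef("hello lucene"); // example value only
        String decoded = ref.utf8ToString();
        System.out.println(decoded); // prints: hello lucene
    }
}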
From source file:org.apache.blur.analysis.type.AclDiscoverFieldTypeDefinition.java
License:Apache License
@Override
public String readTerm(BytesRef byteRef) {
    return byteRef.utf8ToString();
}
From source file:org.apache.blur.analysis.type.spatial.BaseSpatialFieldTypeDefinitionTest.java
License:Apache License
protected void runGisDocValueTest(String s) throws IOException {
    DirectoryReader reader = DirectoryReader.open(_dir);
    AtomicReader atomicReader = reader.leaves().get(0).reader();
    SortedDocValues sortedDocValues = atomicReader.getSortedDocValues("fam.geo");
    BytesRef result = new BytesRef();
    sortedDocValues.get(0, result);
    assertEquals(s, result.utf8ToString());
    System.out.println(result.utf8ToString());
    reader.close();
}
From source file:org.apache.blur.command.TermsCommand.java
License:Apache License
private static List<String> terms(IndexReader reader, String fieldName, String startWith, short size)
        throws IOException {
    Term term = getTerm(fieldName, startWith);
    List<String> terms = new ArrayList<String>(size);
    AtomicReader areader = BlurUtil.getAtomicReader(reader);
    Terms termsAll = areader.terms(term.field());
    if (termsAll == null) {
        return terms;
    }
    TermsEnum termEnum = termsAll.iterator(null);
    SeekStatus status = termEnum.seekCeil(term.bytes());
    if (status == SeekStatus.END) {
        return terms;
    }
    BytesRef currentTermText = termEnum.term();
    do {
        terms.add(currentTermText.utf8ToString());
        if (terms.size() >= size) {
            return terms;
        }
    } while ((currentTermText = termEnum.next()) != null);
    return terms;
}
From source file:org.apache.blur.lucene.serializer.SerializerUtil.java
License:Apache License
public static String readString(DataInput in) throws IOException {
    BytesRef bytes = readBytesRef(in);
    return bytes.utf8ToString();
}
From source file:org.apache.blur.manager.IndexManager.java
License:Apache License
public static List<String> terms(IndexReader reader, FieldTypeDefinition typeDef, String columnFamily,
        String columnName, String startWith, short size) throws IOException {
    if (startWith == null) {
        startWith = "";
    }
    Term term = getTerm(columnFamily, columnName, startWith);
    List<String> terms = new ArrayList<String>(size);
    AtomicReader areader = BlurUtil.getAtomicReader(reader);
    Terms termsAll = areader.terms(term.field());
    if (termsAll == null) {
        return terms;
    }
    TermsEnum termEnum = termsAll.iterator(null);
    SeekStatus status = termEnum.seekCeil(term.bytes());
    if (status == SeekStatus.END) {
        return terms;
    }
    BytesRef currentTermText = termEnum.term();
    do {
        terms.add(currentTermText.utf8ToString());
        String readTerm = typeDef.readTerm(currentTermText);
        if (readTerm != null) {
            terms.add(readTerm);
        }
        if (terms.size() >= size) {
            return terms;
        }
    } while ((currentTermText = termEnum.next()) != null);
    return terms;
}
From source file:org.apache.ctakes.utils.wiki.WikiIndex.java
License:Apache License
/**
 * Return a hash table that maps terms to their tfidf values.
 * The input is a list of TermFreqVector objects. The return
 * value is formed by summing up individual tfidf vectors.
 */
private HashMap<String, Double> makeTfIdfVector(ArrayList<Terms> termFreqVectors) throws IOException {
    // map terms to their tfidf values
    CounterMap<String> countVector = new CounterMap<String>();
    HashMap<String, Double> tfIdfVector = new HashMap<String, Double>();
    for (Terms terms : termFreqVectors) {
        if (terms == null) {
            continue; // some documents are empty
        }
        // String[] terms = termFreqVector.getTerms();
        // int[] freqs = termFreqVector.getTermFrequencies();
        TermsEnum termsEnum = terms.iterator(null);
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termStr = term.utf8ToString();
            countVector.add(termStr);
        }
        for (String key : countVector.keySet()) {
            double tf = similarity.tf((long) countVector.get(key));
            double idf = similarity.idf(indexReader.docFreq(new Term("text", key)), numDocs);
            tfIdfVector.put(key, tf * idf);
        }
        /*
        for (int i = 0; i < terms.length; i++) {
            double tf = similarity.tf(freqs[i]); // defaultSimilarity.tf(freqs[i]);
            double idf = similarity.idf(indexReader.docFreq(new Term("text", terms[i])), numDocs);
            if (tfIdfVector.containsKey(terms[i])) {
                tfIdfVector.put(terms[i], tfIdfVector.get(terms[i]) + tf * idf);
            } else {
                tfIdfVector.put(terms[i], tf * idf);
            }
        }
        */
    }
    return tfIdfVector;
}
From source file:org.apache.mahout.utils.vectors.lucene.CachedTermInfo.java
License:Apache License
public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
    this.field = field;
    Terms t = MultiFields.getTerms(reader, field);
    TermsEnum te = t.iterator(null);
    int numDocs = reader.numDocs();
    double percent = numDocs * maxDfPercent / 100.0;
    // Should we use a linked hash map so that we know terms are in order?
    termEntries = Maps.newLinkedHashMap();
    int count = 0;
    BytesRef text;
    while ((text = te.next()) != null) {
        int df = te.docFreq();
        if (df >= minDf && df <= percent) {
            TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
            termEntries.put(entry.getTerm(), entry);
        }
    }
}
From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java
License:Apache License
/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
    if (wpvws.size() < minNumIds) {
        log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
        return null;
    }
    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
    Directory dir = FSDirectory.open(new File(this.indexDir));
    IndexReader reader = DirectoryReader.open(dir);
    log.info("# of documents in the index {}", reader.numDocs());
    Collection<String> idSet = Sets.newHashSet();
    for (WeightedPropertyVectorWritable wpvw : wpvws) {
        Vector vector = wpvw.getVector();
        if (vector instanceof NamedVector) {
            idSet.add(((NamedVector) vector).getName());
        }
    }
    int numDocs = reader.numDocs();
    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
    log.info("Populating term infos from the index");
    /**
     * This code is as that of CachedTermInfo, with one major change, which is to get the document frequency.
     *
     * Since we have deleted the documents out of the cluster, the document frequency for a term should only
     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
     * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
     * frequencies in each document. The number of results of this call will be the in-cluster document
     * frequency.
     */
    Terms t = MultiFields.getTerms(reader, contentField);
    TermsEnum te = t.iterator(null);
    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if there are no deletions
    int count = 0;
    BytesRef term;
    while ((term = te.next()) != null) {
        OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // check to see if we don't have any deletions (null) or if the document is live
            if (liveDocs != null && !liveDocs.get(docID)) {
                // document is deleted...
                termBitset.set(docsEnum.docID());
            }
        }
        // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
        // This modifies the termBitset, but that's fine as we are not using it anywhere else.
        termBitset.and(clusterDocBitset);
        int inclusterDF = (int) termBitset.cardinality();
        TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
        termEntryMap.put(entry.getTerm(), entry);
    }
    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();
    int clusterSize = wpvws.size();
    for (TermEntry termEntry : termEntryMap.values()) {
        int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
        int outDF = corpusDF - termEntry.getDocFreq();
        int inDF = termEntry.getDocFreq();
        double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
        TermInfoClusterInOut termInfoCluster =
                new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
        clusteredTermInfo.add(termInfoCluster);
    }
    Collections.sort(clusteredTermInfo);
    // Cleanup
    Closeables.close(reader, true);
    termEntryMap.clear();
    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
From source file:org.apache.mahout.utils.vectors.lucene.TFDFMapper.java
License:Apache License
public void map(BytesRef term, int frequency) {
    TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
    if (entry != null) {
        vector.setQuick(entry.getTermIdx(),
                weight.calculate(frequency, entry.getDocFreq(), (int) numTerms, numDocs));
    }
}
From source file:org.apache.oodt.cas.filemgr.browser.model.QueryBuilder.java
License:Apache License
public void GenerateCASQuery(org.apache.oodt.cas.filemgr.structs.Query casQ,
        org.apache.lucene.search.Query luceneQ) {
    if (luceneQ instanceof TermQuery) {
        Term t = ((TermQuery) luceneQ).getTerm();
        if (!t.field().equals("__FREE__")) {
            String element = database.getElementID(t.field());
            if (!element.equals("") && !t.text().equals("")) {
                casQ.addCriterion(new TermQueryCriteria(element, t.text()));
            }
        }
    } else if (luceneQ instanceof PhraseQuery) {
        Term[] t = ((PhraseQuery) luceneQ).getTerms();
        if (!t[0].field().equals("__FREE__")) {
            for (Term aT : t) {
                String element = database.getElementID(aT.field());
                if (!element.equals("") && !aT.text().equals("")) {
                    casQ.addCriterion(new TermQueryCriteria(element, aT.text()));
                }
            }
        }
    } else if (luceneQ instanceof TermRangeQuery) {
        BytesRef startT = ((TermRangeQuery) luceneQ).getLowerTerm();
        BytesRef endT = ((TermRangeQuery) luceneQ).getUpperTerm();
        String element = database.getElementID(((TermRangeQuery) luceneQ).getField());
        if (!element.equals("") && !startT.utf8ToString().equals("") && !endT.utf8ToString().equals("")) {
            casQ.addCriterion(new RangeQueryCriteria(element, startT.utf8ToString(), endT.utf8ToString()));
        }
    } else if (luceneQ instanceof BooleanQuery) {
        List<BooleanClause> clauses = ((BooleanQuery) luceneQ).clauses();
        for (BooleanClause clause : clauses) {
            GenerateCASQuery(casQ, clause.getQuery());
        }
    } else {
        System.out.println("Error Parsing Query");
        System.exit(-1);
    }
}