Usage examples for org.apache.lucene.util.BytesRef#utf8ToString()
public String utf8ToString()
From source file:lab_mri.RocchioExpander.java
/**
 * Computes a tf-idf style weight for {@code term}.
 *
 * <p>Walks {@code results} in order and, for each one, reads the "abst" term vector of the
 * document whose index id is the result id minus one. The first vector that contains
 * {@code term} (compared case-insensitively) decides the score:
 * {@code totalTermFreq * log(numDocs / docFreq)}.
 *
 * @param results search results whose ids are parsed as integers
 * @param term    the term to score
 * @return the weight from the first matching document, or {@code 0} when no term vector
 *         contains the term
 * @throws IOException if the index cannot be opened or read
 */
private float getScore(List<SearchResult> results, String term) throws IOException {
    try (IndexReader reader = IndexReader
            .open(FSDirectory.open(new File("/home/luigi/NetBeansProjects/LAB_mri/inv_index")))) {
        for (SearchResult hit : results) {
            // NOTE(review): the "- 1" suggests result ids are 1-based while doc ids are 0-based - confirm.
            int docId = Integer.parseInt(hit.getId()) - 1;
            Terms vector = reader.getTermVector(docId, "abst");
            int totalDocs = reader.numDocs();
            if (vector == null) {
                // No term vector stored for this document/field.
                continue;
            }
            TermsEnum cursor = vector.iterator(null);
            for (BytesRef current = cursor.next(); current != null; current = cursor.next()) {
                if (!term.equalsIgnoreCase(current.utf8ToString())) {
                    continue;
                }
                double termFreq = cursor.totalTermFreq();
                double docFreq = reader.docFreq(new Term("abst", term));
                float idf = (float) Math.log(totalDocs / docFreq);
                return (float) (termFreq * idf);
            }
        }
    }
    return 0;
}
From source file:lia.chapter5.CategorizerTest.java
License:Apache License
/**
 * Accumulates the frequency of every term in {@code termsVector} into {@code vectorMap}
 * and echoes each term with its frequency to standard output.
 *
 * <p>The map is keyed by the term text and holds {@link Long} totals. A term already present
 * has the new frequency added to its total; otherwise it is inserted.
 *
 * @param vectorMap   map from term text ({@code String}) to accumulated frequency ({@code Long})
 * @param termsVector the terms to read
 * @throws IOException if the terms cannot be enumerated
 */
private void addTermFreqToMap(Map vectorMap, Terms termsVector) throws IOException {
    TermsEnum termsEnum = termsVector.iterator();
    for (BytesRef bytesRef = termsEnum.next(); bytesRef != null; bytesRef = termsEnum.next()) {
        String term = bytesRef.utf8ToString();
        long freq = termsEnum.totalTermFreq();
        System.out.println(term + " " + freq);

        Long previous = (Long) vectorMap.get(term);
        // Add in long arithmetic: narrowing the running total to int first would silently
        // truncate totals larger than Integer.MAX_VALUE.
        long total = (previous == null) ? freq : previous.longValue() + freq;
        vectorMap.put(term, Long.valueOf(total));
    }
    System.out.println();
}
From source file:lucene.CosineDocumentSimilarity.java
Map<String, Integer> getTermFrequencies(IndexReader reader, int docId) throws IOException { Terms vector = reader.getTermVector(docId, CONTENT); TermsEnum termsEnum = null;/*from ww w . j a v a 2 s. c o m*/ // termsEnum = vector.iterator(termsEnum); Map<String, Integer> frequencies = new HashMap<>(); BytesRef text = null; while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int freq = (int) termsEnum.totalTermFreq(); frequencies.put(term, freq); terms.add(term); } return frequencies; }
From source file:lucene.security.index.SecureAtomicReaderTestBase.java
License:Apache License
@Test public void testTermWalk() throws IOException, ParseException { SecureAtomicReader secureReader = getSecureReader(); Fields fields = secureReader.fields(); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(null); BytesRef ref; while ((ref = termsEnum.next()) != null) { System.out.println(field + " " + ref.utf8ToString()); DocsEnum docsEnum = termsEnum.docs(null, null); int doc; while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { System.out.println(field + " " + ref.utf8ToString() + " " + doc); }//from w w w .ja v a 2 s . c om } } secureReader.close(); }
From source file:lucenesearch.NGram.java
/**
 * Builds all pairs of consecutive terms from the "Body" term vector of {@code doc}.
 *
 * <p>NOTE(review): "consecutive" means consecutive in the order the {@code TermsEnum} yields
 * terms, which may not be the order in which they occur in the text - confirm this is intended.
 *
 * @param doc document whose "Body" term vector is read
 * @return one two-element array per adjacent pair of terms; empty when there are fewer than
 *         two terms
 * @throws IOException if the term vector cannot be read
 */
private ArrayList<String[]> get2Gram(ExtendedDocument doc) throws IOException {
    ArrayList<String[]> grams = new ArrayList<>();
    TermsEnum cursor = doc.getTermVector("Body").iterator();

    // Materialise the terms first so they can be addressed by position.
    ArrayList<String> tokens = new ArrayList<>();
    for (BytesRef ref = cursor.next(); ref != null; ref = cursor.next()) {
        tokens.add(ref.utf8ToString());
    }

    for (int first = 0; first + 1 < tokens.size(); first++) {
        grams.add(new String[] { tokens.get(first), tokens.get(first + 1) });
    }
    return grams;
}
From source file:lucenesearch.NGram.java
private ArrayList<String[]> get4Gram(ExtendedDocument doc) throws IOException { ArrayList<String[]> res = new ArrayList<>(); Terms t = doc.getTermVector("Body"); TermsEnum itr = t.iterator();//from w w w .j a v a2 s . c o m BytesRef term; ArrayList<String> terms = new ArrayList<>(); while ((term = itr.next()) != null) { String termText = term.utf8ToString(); terms.add(termText); } int n = terms.size(); for (int i = 3; i < n; i++) { String[] temp = new String[4]; temp[0] = terms.get(i - 3); temp[1] = terms.get(i - 2); temp[2] = terms.get(i - 1); temp[3] = terms.get(i); res.add(temp); } return res; }
From source file:lux.IndexTestSupport.java
License:Mozilla Public License
/**
 * Prints up to 100 terms, each with its document frequency, for every indexed field
 * except the URI field.
 *
 * @param dir     directory holding the index to dump
 * @param indexer supplies the configuration used to look up the URI field name
 * @throws IOException if the index cannot be opened or read
 */
public static void printAllTerms(Directory dir, XmlIndexer indexer) throws IOException {
    DirectoryReader reader = DirectoryReader.open(dir);
    try {
        Fields fields = MultiFields.getFields(reader);
        System.out.println("Printing all terms (except uri)");
        String uriFieldName = indexer.getConfiguration().getFieldName(FieldRole.URI);
        if (fields == null) {
            // MultiFields.getFields returns null when the reader has no postings at all.
            return;
        }
        for (String field : fields) {
            if (field.equals(uriFieldName)) {
                continue;
            }
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef text;
            int count = 0;
            // Cap the output at 100 terms per field.
            while ((text = termsEnum.next()) != null && count++ < 100) {
                System.out.println(field + " " + text.utf8ToString() + ' ' + termsEnum.docFreq());
            }
        }
    } finally {
        // Close the reader even if enumeration fails part-way through.
        reader.close();
    }
}
From source file:nicta.com.au.patent.pac.index.TermFreqVector.java
public TermFreqVector(Terms terms) throws IOException { TermFreqVector = new HashMap<>(); if (terms != null && terms.size() > 0) { TermsEnum termsEnum = terms.iterator(null); // access the terms for this field BytesRef term; // System.out.println("--------"); while ((term = termsEnum.next()) != null) {// explore the terms for this field DocsAndPositionsEnum docsPosEnum = termsEnum.docsAndPositions(null, null); docsPosEnum.nextDoc();/*from w w w . j a v a 2 s . c om*/ TermFreqVector.put(term.utf8ToString(), docsPosEnum.freq()); // System.out.print(term.utf8ToString() + " " + docsPosEnum.freq() + " positions: "); //get the term frequency in the document for (int j = 0; j < docsPosEnum.freq(); j++) { // System.out.print(docsPosEnum.nextPosition() + " "); } // System.out.println(""); // System.out.print(term.utf8ToString()+" "); } // System.out.println(""); // System.out.println("----------"); } }
From source file:org.alfresco.solr.AlfrescoFieldType.java
License:Open Source License
/**
 * Converts an indexed term to its external object form by decoding its bytes as UTF-8.
 *
 * @param sf   the schema field the term belongs to (not consulted)
 * @param term the indexed term bytes
 * @return the term decoded as a {@code String}
 */
@Override
public Object toObject(SchemaField sf, BytesRef term) {
    final String decoded = term.utf8ToString();
    return decoded;
}
From source file:org.allenai.blacklab.queryParser.lucene.QueryParserBase.java
License:Apache License
/** * Builds a new {@link TermRangeQuery} instance * @param field Field//from ww w .j av a2s .co m * @param part1 min * @param part2 max * @param startInclusive true if the start of the range is inclusive * @param endInclusive true if the end of the range is inclusive * @return new {@link TermRangeQuery} instance */ protected TextPattern newRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive) { final BytesRef start; final BytesRef end; if (part1 == null) { start = null; } else { start = analyzeRangeTerms ? analyzeMultitermTerm(field, part1) : new BytesRef(part1); } if (part2 == null) { end = null; } else { end = analyzeRangeTerms ? analyzeMultitermTerm(field, part2) : new BytesRef(part2); } // BL was: TermRangeQuery final TPTermRange query = new TPTermRange(field, start.utf8ToString(), end.utf8ToString(), startInclusive, endInclusive); //query.setRewriteMethod(multiTermRewriteMethod); // BL disabled return query; }