List of usage examples for org.apache.lucene.util.BytesRef.utf8ToString()
public String utf8ToString()
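utf8ToString() interprets the stored bytes as UTF-8 and returns the decoded String; it assumes the bytes actually are valid UTF-8 (terms from numeric or binary fields are encoded differently and will not round-trip). A minimal sketch of the call in isolation (the class name and literal are illustrative, not taken from the sources below):

    import org.apache.lucene.util.BytesRef;

    public class Utf8ToStringExample {
        public static void main(String[] args) {
            // BytesRef wraps a UTF-8 encoded byte slice; utf8ToString() decodes it back.
            BytesRef bytes = new BytesRef("hello"); // the CharSequence constructor encodes to UTF-8
            System.out.println(bytes.utf8ToString()); // prints "hello"
        }
    }

In practice the method is most often called on the BytesRef returned by TermsEnum.next(), as the examples that follow show.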
From source file: game.TermFreq.java
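This example loads a document's term vector, decodes each term with utf8ToString(), and builds a length-normalized tf vector with idf-based weights.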
void loadTfVec() throws Exception {
    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);
    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field (Lucene 4.x API)
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null);
        // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            // deep-copy the BytesRef: the enum reuses it across next() calls
            TermFreq tfq = new TermFreq(
                    new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, BytesRef.deepCopyOf(term)), termStr, tf);
            tfvec.add(tfq);
            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by document length
        float idf = sumDf / (float) reader.docFreq(tf.term); // float division, avoids truncating the ratio
        tf.wt = (float) (Math.log(1 + LAMBDA / ONE_MINUS_LAMBDA * tf.tf * idf));
    }
    Collections.sort(tfvec);
}
From source file: indexer.OptimizedRealValuedVecIndexer.java
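This indexer iterates the cell identifiers stored in a document's term-vector field, decoding each one with utf8ToString() to decide whether the cell needs to be re-quantized.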
void processAllDocumentWise() throws Exception {
    Cell cell, requantizedCell;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;
    Set<Cell> splitCells = new HashSet<>();

    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        tfvector = reader.getTermVector(i, DocVector.FIELD_CELL_ID);
        termsEnum = tfvector.iterator(); // access the terms for this field
        StringBuffer requantizedCells = new StringBuffer();
        DocVector p = new DocVector(reader.document(i), numDimensions, numIntervals, null);

        // iterate over each cell in this document
        while ((term = termsEnum.next()) != null) { // explore the terms for this field
            String cellId = term.utf8ToString(); // one cell docName
            cell = new Cell(cellId);
            if (cell.toSplit(reader)) { // do we need to requantize?
                splitCells.add(cell); // mark this cell
                requantizedCell = cell.quantize(p); // this function returns a new object
                System.out.println("Cell " + cell + " updated to " + requantizedCell);
                requantizedCells.append(requantizedCell).append(" ");
            } else {
                requantizedCells.append(cell).append(" ");
            }
        }
        p.setQuantized(requantizedCells.toString());
        writer.addDocument(p.constructDoc());
    }
    saveSplitCells(writer, new ArrayList<Cell>(splitCells));
    reader.close();
    writer.close();
}
From source file: indextranslator.BOWTranslator.java
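This translator walks a document's term vector, decoding each term with utf8ToString() before appending its dictionary translations to the output document.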
public void translate(String docIdStr, int docId) throws Exception {
    String termText;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;
    int tf;

    tfvector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT);
    if (tfvector == null || tfvector.size() == 0)
        return;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field
    StringBuffer buff = new StringBuffer();
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();
        buff.append(dict.getTranslations(termText, tf)).append("\n");
    }
    Document doc = constructDoc(docIdStr, buff.toString());
    writer.addDocument(doc);
}
From source file: io.anserini.integration.IndexerTest.java
License: Apache License
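This test helper iterates a field's term dictionary, decoding each term with utf8ToString() in order to print its postings list.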
private void dumpPostings(IndexReader reader) throws IOException {
    // This is how you iterate through terms in the postings list.
    LeafReader leafReader = reader.leaves().get(0).reader();
    TermsEnum termsEnum = leafReader.terms("text").iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        // This is the current term in the dictionary.
        String token = bytesRef.utf8ToString();
        Term term = new Term("text", token);
        System.out.print(token + " (df = " + reader.docFreq(term) + "):");

        PostingsEnum postingsEnum = leafReader.postings(term);
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(String.format(" (%s, %s)", postingsEnum.docID(), postingsEnum.freq()));
        }
        System.out.println("");

        bytesRef = termsEnum.next();
    }
}
From source file: io.anserini.rerank.lib.Rm3Reranker.java
License: Apache License
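This RM3 feedback step decodes each candidate term with utf8ToString(), then filters and weights it to build a feature vector; the long comment explains the df-threshold heuristic used in place of stopwords lists.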
private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();

    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();

        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See the following PR for details:
            //   https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: if a term appears in more than n percent
            // of the documents, it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection sorted by df: for the Tweets2011
            // collection, I found a threshold close to a nice round number that approximated the
            // length of the current stopwords list, by eyeballing the df values. This turned out to
            // be 1%. I did this again for the Tweets2013 collection, using the same approach, and
            // obtained a value of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f) {
                continue;
            }

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return an empty feature vector
        return f;
    }

    return f;
}
From source file: io.anserini.util.ExtractTopDfTerms.java
License: Apache License
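This utility scans every term of a field, decoding each with utf8ToString() and keeping the top-k terms by document frequency in a bounded priority queue.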
public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else {
                return (p1.value < p2.value) ? -1 : 1;
            }
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            // evict the smallest entry if the new pair ranks higher
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}
From source file: io.crate.analyze.NumberOfReplicasTest.java
License: Apache License
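The tests below use utf8ToString() to compare a BytesRef-valued replica setting against its expected string form.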
@Test
public void testFromEmptySettings() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas.fromSettings(Settings.EMPTY);
    assertThat(numberOfReplicas.utf8ToString(), is("1"));
}
From source file: io.crate.analyze.NumberOfReplicasTest.java
License: Apache License
@Test
public void testNumber() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas.fromSettings(
            Settings.builder().put(NumberOfReplicas.NUMBER_OF_REPLICAS, 4).build());
    assertThat(numberOfReplicas.utf8ToString(), is("4"));
}
From source file: io.crate.analyze.NumberOfReplicasTest.java
License: Apache License
@Test
public void testAutoExpandSettingsTakePrecedence() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas.fromSettings(
            Settings.builder()
                    .put(NumberOfReplicas.AUTO_EXPAND_REPLICAS, "0-all")
                    .put(NumberOfReplicas.NUMBER_OF_REPLICAS, 1)
                    .build());
    assertThat(numberOfReplicas.utf8ToString(), is("0-all"));
}
From source file: io.crate.core.NumberOfReplicasTest.java
License: Apache License
@Test
public void testFromEmptySettings() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas.fromSettings(ImmutableSettings.EMPTY);
    assertThat(numberOfReplicas.utf8ToString(), is("1"));
}