Usage examples for org.apache.lucene.util.BytesRef#utf8ToString()
public String utf8ToString()
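A BytesRef holds a slice of raw UTF-8 bytes; utf8ToString() decodes the referenced range (offset through offset + length) into a Java String. A minimal standalone sketch of the round trip (class and variable names are illustrative, not taken from the sources below):

    import org.apache.lucene.util.BytesRef;

    public class Utf8ToStringExample {
        public static void main(String[] args) {
            // new BytesRef(CharSequence) encodes the text as UTF-8
            BytesRef ref = new BytesRef("caf\u00e9");
            // utf8ToString() decodes only the referenced byte slice back to a String
            String decoded = ref.utf8ToString();
            System.out.println(decoded.equals("caf\u00e9")); // true
        }
    }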
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static String SortableStr2long(BytesRef val) {
    // TODO: operate directly on BytesRef
    return SortableStr2long(val.utf8ToString());
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static float SortableStr2float(BytesRef val) {
    // TODO: operate directly on BytesRef
    return SortableStr2float(val.utf8ToString());
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static double SortableStr2double(BytesRef val) {
    // TODO: operate directly on BytesRef
    return SortableStr2double(val.utf8ToString());
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static int SortableStr2int(BytesRef sval, int offset, int len) {
    // TODO: operate directly on BytesRef
    return SortableStr2int(sval.utf8ToString(), offset, len);
}
From source file:org.apache.solr.util.NumberUtils.java
License:Apache License
public static long SortableStr2long(BytesRef sval, int offset, int len) {
    // TODO: operate directly on BytesRef
    return SortableStr2long(sval.utf8ToString(), offset, len);
}
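Each NumberUtils overload above simply decodes the BytesRef via utf8ToString() and delegates to the matching String-based parser. A hedged round-trip sketch, assuming long2sortableStr(long) exists as the paired encoder in the same Solr NumberUtils class (the BytesRef decoder overload shown earlier returns the decoded value as a String):

    import org.apache.lucene.util.BytesRef;
    import org.apache.solr.util.NumberUtils;

    // Assumption: long2sortableStr(long) is the encoder paired with these decoders.
    BytesRef ref = new BytesRef(NumberUtils.long2sortableStr(42L));
    String decoded = NumberUtils.SortableStr2long(ref); // "42", via utf8ToString() internally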
From source file:org.apache.tika.eval.tokens.LuceneTokenCounter.java
License:Apache License
void count(String field) throws IOException {
    long tokenCount = leafReader.getSumTotalTermFreq(field);
    if (tokenCount > Integer.MAX_VALUE) {
        throw new IllegalArgumentException("can't handle longs");
    }
    int tokenCountInt = (int) tokenCount;
    int uniqueTokenCount = 0;
    SummaryStatistics summStats = new SummaryStatistics();
    double ent = 0.0d;
    double p = 0.0d;
    double base = 2.0;
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        // if there were no terms
        fieldStats.put(field,
                new TokenStatistics(uniqueTokenCount, tokenCountInt, new TokenIntPair[0], ent, summStats));
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN);
    while (bytesRef != null) {
        long termFreq = termsEnum.totalTermFreq();
        if (termFreq > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("Sorry can't handle longs yet");
        }
        int tf = (int) termFreq;
        // TODO: figure out how to avoid Stringifying this to get codepoint count
        String t = bytesRef.utf8ToString();
        int len = t.codePointCount(0, t.length());
        for (int i = 0; i < tf; i++) {
            summStats.addValue(len);
        }
        p = (double) tf / (double) tokenCount;
        ent += p * FastMath.log(base, p);
        if (queue.top() == null || queue.size() < topN || tf >= queue.top().getValue()) {
            queue.insertWithOverflow(new TokenIntPair(t, tf));
        }
        uniqueTokenCount++;
        bytesRef = termsEnum.next();
    }
    if (tokenCountInt > 0) {
        ent = (-1.0d / (double) tokenCountInt) * ent;
    }
    fieldStats.put(field,
            new TokenStatistics(uniqueTokenCount, tokenCountInt, queue.getArray(), ent, summStats));
}
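The TODO above notes that the term is stringified only to obtain its code point count. If just the length is needed, the decode can be skipped; a sketch assuming Lucene's UnicodeUtil.codePointCount(BytesRef), which counts code points directly on the UTF-8 bytes:

    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.UnicodeUtil;

    // Sketch: code point count straight from the UTF-8 bytes, no String allocation.
    // UnicodeUtil.codePointCount throws IllegalArgumentException on invalid UTF-8.
    static int codePointLength(BytesRef bytesRef) {
        return UnicodeUtil.codePointCount(bytesRef);
    }

In this particular method the String is still needed for the TokenIntPair, so at best the utf8ToString() call could be deferred into the branch that actually inserts into the queue.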
From source file:org.apache.tika.eval.tools.TopCommonTokenCounter.java
License:Apache License
private void execute(Path inputFile, Path commonTokensFile) throws Exception {
    Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
    AbstractTokenTFDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
    try {
        Directory directory = FSDirectory.open(luceneDir);
        AnalyzerManager analyzerManager = AnalyzerManager.newInstance(-1);
        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        int maxLen = 1000000;
        int len = 0;
        try (IndexWriter writer = new IndexWriter(directory, indexWriterConfig)) {
            List<Document> docs = new ArrayList<>();
            try (BufferedReader reader = getReader(inputFile)) {
                String line = reader.readLine();
                while (line != null) {
                    len += line.length();
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    docs.add(document);
                    if (len > maxLen) {
                        writer.addDocuments(docs);
                        docs.clear();
                        len = 0;
                    }
                    line = reader.readLine();
                }
            }
            if (docs.size() > 0) {
                writer.addDocuments(docs);
            }
            writer.commit();
            writer.flush();
        }
        try (IndexReader reader = DirectoryReader.open(directory)) {
            LeafReader wrappedReader = SlowCompositeReaderWrapper.wrap(reader);
            Terms terms = wrappedReader.terms(FIELD);
            TermsEnum termsEnum = terms.iterator();
            BytesRef bytesRef = termsEnum.next();
            int docsWThisField = wrappedReader.getDocCount(FIELD);
            while (bytesRef != null) {
                int df = termsEnum.docFreq();
                long tf = termsEnum.totalTermFreq();
                if (MIN_DOC_FREQ > -1 && df < MIN_DOC_FREQ) {
                    bytesRef = termsEnum.next();
                    continue;
                }
                if (queue.top() == null || queue.size() < TOP_N || df >= queue.top().df) {
                    String t = bytesRef.utf8ToString();
                    if (!WHITE_LIST.contains(t) && !BLACK_LIST.contains(t)) {
                        queue.insertWithOverflow(new TokenDFTF(t, df, tf));
                    }
                }
                bytesRef = termsEnum.next();
            }
        }
    } finally {
        FileUtils.deleteDirectory(luceneDir.toFile());
    }
    writeTopN(commonTokensFile, queue);
}
From source file:org.buzzinate.lezhi.query.LezhiTermsEnum.java
License:Apache License
protected AcceptStatus accept(BytesRef term) throws IOException {
    System.out.println(term.utf8ToString() + ", docfreq=" + docFreq());
    if (StringHelper.startsWith(term, prefixRef)) {
        return AcceptStatus.YES;
    } else {
        return AcceptStatus.END;
    }
}
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java
License:Apache License
private static SortedSet<BytesRef> parseForDocValues(SortedSet<BytesRef> endUserFormattedValues,
        DocValueFormat format) {
    SortedSet<BytesRef> result = endUserFormattedValues;
    if (endUserFormattedValues != null) {
        if (format != DocValueFormat.RAW) {
            result = new TreeSet<>();
            for (BytesRef formattedVal : endUserFormattedValues) {
                result.add(format.parseBytesRef(formattedVal.utf8ToString()));
            }
        }
    }
    return result;
}
From source file:org.codelibs.elasticsearch.search.aggregations.bucket.terms.support.IncludeExclude.java
License:Apache License
public LongFilter convertToLongFilter(DocValueFormat format) {
    if (isPartitionBased()) {
        return new PartitionedLongFilter();
    }
    int numValids = includeValues == null ? 0 : includeValues.size();
    int numInvalids = excludeValues == null ? 0 : excludeValues.size();
    SetBackedLongFilter result = new SetBackedLongFilter(numValids, numInvalids);
    if (includeValues != null) {
        for (BytesRef val : includeValues) {
            result.addAccept(format.parseLong(val.utf8ToString(), false, null));
        }
    }
    if (excludeValues != null) {
        for (BytesRef val : excludeValues) {
            result.addReject(format.parseLong(val.utf8ToString(), false, null));
        }
    }
    return result;
}