Usage examples for org.apache.lucene.util.BytesRef.utf8ToString()
public String utf8ToString()
From source file:org.iis.plagiarismdetector.core.lucene.LowFreqTerms.java
/** * /*from w w w . j av a 2 s.c o m*/ * <code>fillQueue</code> is a function that fill given priority queue with * given object * * @param termsEnum * term enumerator that contains the terms those should be pushed * to the given queue * @param tiq * the priority queue * @param field * name of the index field which terms belong to * @throws Exception */ public void fillQueue(TermsEnum termsEnum, PriorityQueue<TermStats> tiq, String field) throws Exception { BytesRef term; while ((term = termsEnum.next()) != null) { if (term.utf8ToString().matches("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?")) { System.out.println(term.utf8ToString()); } if ((termsEnum.docFreq() > 1) && (!term.utf8ToString().matches("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"))) { BytesRef r = new BytesRef(); r.copyBytes(term); tiq.insertWithOverflow( new TermStats(field, r, termsEnum.docFreq(), this.getTotalTF_PerField(field, term))); } } }
From source file:org.languagetool.dev.archive.StartTokenCounter.java
License:Open Source License
public static void main(String[] args) throws IOException { long totalCount = 0; File dir = new File("/data/google-ngram-index/en/2grams"); try (FSDirectory directory = FSDirectory.open(dir.toPath()); IndexReader reader = DirectoryReader.open(directory)) { IndexSearcher searcher = new IndexSearcher(reader); Fields fields = MultiFields.getFields(reader); Terms ngrams = fields.terms("ngram"); TermsEnum iterator = ngrams.iterator(); BytesRef next; int i = 0; while ((next = iterator.next()) != null) { String term = next.utf8ToString(); if (term.startsWith(LanguageModel.GOOGLE_SENTENCE_START)) { if (term.matches(".*_(ADJ|ADV|NUM|VERB|ADP|NOUN|PRON|CONJ|DET|PRT)$")) { //System.out.println("ignore: " + term); continue; }//from w w w. j a v a2 s . co m TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 3); if (topDocs.totalHits == 0) { throw new RuntimeException("No hits for " + term + ": " + topDocs.totalHits); } else if (topDocs.totalHits == 1) { int docId = topDocs.scoreDocs[0].doc; Document document = reader.document(docId); Long count = Long.parseLong(document.get("count")); //System.out.println(term + " -> " + count); totalCount += count; if (++i % 10_000 == 0) { System.out.println(i + " ... " + totalCount); } } else { throw new RuntimeException( "More hits than expected for " + term + ": " + topDocs.totalHits); } } } } System.out.println("==> " + totalCount); }
From source file:org.languagetool.dev.bigdata.GermanUppercasePhraseFinder.java
License:Open Source License
public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("Usage: " + GermanUppercasePhraseFinder.class.getSimpleName() + " <ngramIndexDir>"); System.exit(1);//from www.j av a2 s . co m } JLanguageTool lt = new JLanguageTool(Languages.getLanguageForShortCode("de")); FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); Fields fields = MultiFields.getFields(reader); Terms terms = fields.terms("ngram"); TermsEnum termsEnum = terms.iterator(); int count = 0; BytesRef next; while ((next = termsEnum.next()) != null) { String term = next.utf8ToString(); count++; //term = "persischer Golf"; // for testing String[] parts = term.split(" "); boolean useful = true; int lcCount = 0; List<String> ucParts = new ArrayList<>(); for (String part : parts) { if (part.length() < MIN_TERM_LEN) { useful = false; break; } String uc = StringTools.uppercaseFirstChar(part); if (!part.equals(uc)) { lcCount++; } ucParts.add(uc); } if (!useful || lcCount == 0 || lcCount == 2) { continue; } String uppercase = Strings.join(ucParts, " "); if (term.equals(uppercase)) { continue; } long thisCount = getOccurrenceCount(reader, searcher, term); long thisUpperCount = getOccurrenceCount(reader, searcher, uppercase); if (count % 10_000 == 0) { System.err.println(count + " @ " + term); } if (thisCount > LIMIT || thisUpperCount > LIMIT) { if (thisUpperCount > thisCount) { if (isRelevant(lt, term)) { float factor = (float) thisUpperCount / thisCount; System.out.printf( "%.2f " + thisUpperCount + " " + uppercase + " " + thisCount + " " + term + "\n", factor); } } } } }
From source file:org.languagetool.dev.bigdata.LargestNGramFinder.java
License:Open Source License
public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("Usage: " + LargestNGramFinder.class.getSimpleName() + " <ngramIndexDir>"); System.exit(1);//from w w w .j a v a2s . co m } FSDirectory fsDir = FSDirectory.open(new File(args[0]).toPath()); IndexReader reader = DirectoryReader.open(fsDir); IndexSearcher searcher = new IndexSearcher(reader); Fields fields = MultiFields.getFields(reader); long max = 0; String maxTerm = ""; Terms terms = fields.terms("ngram"); TermsEnum termsEnum = terms.iterator(); int count = 0; BytesRef next; while ((next = termsEnum.next()) != null) { String term = next.utf8ToString(); TopDocs topDocs = searcher.search(new TermQuery(new Term("ngram", term)), 5); int docId = topDocs.scoreDocs[0].doc; Document document = reader.document(docId); long thisCount = Long.parseLong(document.get("count")); if (max < thisCount) { max = thisCount; maxTerm = term; } if (count % 10_000 == 0) { System.out.println(count + " -> " + topDocs.totalHits + " for " + term + " -> " + thisCount + ", max so far: " + max + " for '" + maxTerm + "'"); } count++; } System.out.println("Max: " + max + " for " + maxTerm); }
From source file:org.languagetool.dev.bigdata.NeededNGramCounter.java
License:Open Source License
public static void main(String[] args) throws IOException { if (args.length != 1) { System.out.println("Usage: " + NeededNGramCounter.class.getSimpleName() + " <ngramIndexDir>"); System.exit(1);/* w ww .j a va2 s .c o m*/ } Language lang = Languages.getLanguageForShortCode(LANG); String path = "/" + lang.getShortCode() + "/confusion_sets.txt"; Set<String> ngrams; try (InputStream confSetStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(path)) { ngrams = new ConfusionSetLoader().loadConfusionSet(confSetStream).keySet(); } String ngramIndexDir = args[0]; FSDirectory fsDir = FSDirectory.open(new File(ngramIndexDir).toPath()); IndexReader reader = DirectoryReader.open(fsDir); Fields fields = MultiFields.getFields(reader); Terms terms = fields.terms("ngram"); TermsEnum termsEnum = terms.iterator(); int i = 0; int needed = 0; int notNeeded = 0; BytesRef next; while ((next = termsEnum.next()) != null) { String term = next.utf8ToString(); String[] tmpTerms = term.split(" "); boolean ngramNeeded = false; for (String tmpTerm : tmpTerms) { if (ngrams.contains(tmpTerm)) { ngramNeeded = true; break; } } if (ngramNeeded) { //System.out.println("needed: " + term); needed++; } else { //System.out.println("not needed: " + term); notNeeded++; } if (i % 500_000 == 0) { System.out.println(i + "/" + terms.getDocCount()); } i++; } System.out.println("language : " + LANG); System.out.println("ngram index : " + ngramIndexDir); System.out.println("needed ngrams : " + needed); System.out.println("not needed ngrams: " + notNeeded); }
From source file:org.lexevs.dao.index.lucene.v2010.metadata.LuceneMetadataDao.java
License:Open Source License
@Override public AbsoluteCodingSchemeVersionReferenceList listCodingSchemes() { AbsoluteCodingSchemeVersionReferenceList result = new AbsoluteCodingSchemeVersionReferenceList(); try {//from w w w . ja v a2 s . co m final TermsEnum te = luceneIndexTemplate.executeInIndexReader(new IndexReaderCallback<TermsEnum>() { @Override public TermsEnum doInIndexReader(IndexReader indexReader) throws Exception { TermsEnum termsEnum = null; Fields fields = MultiFields.getFields(indexReader); if (fields != null) { Terms terms = fields.terms("codingSchemeNameVersion"); if (terms != null) { termsEnum = terms.iterator(); } } return termsEnum; } }); // TODO see Multifield for a better implementation of this. BytesRef text = null; while ((te != null) && (text = te.next()) != null) { Query temp = new TermQuery(new Term("codingSchemeNameVersion", text.utf8ToString())); List<ScoreDoc> d = this.luceneIndexTemplate.search(temp, null); if (d.size() > 0) { ScoreDoc doc = d.get(0); AbsoluteCodingSchemeVersionReference acsvr = new AbsoluteCodingSchemeVersionReference(); Document document = luceneIndexTemplate.getDocumentById(doc.doc); acsvr.setCodingSchemeURN(document.get("codingSchemeRegisteredName")); acsvr.setCodingSchemeVersion(document.get("codingSchemeVersion")); result.addAbsoluteCodingSchemeVersionReference(acsvr); } } return result; } catch (Exception e) { throw new RuntimeException(e); } }
From source file:org.meresco.lucene.Lucene.java
License:Open Source License
public List<TermCount> termsForField(String field, String prefix, int limit) throws Exception { // if t == str: // convert = lambda term: term.utf8ToString() // elif t == int: // convert = lambda term: NumericUtils.prefixCodedToInt(term) // elif t == long: // convert = lambda term: NumericUtils.prefixCodedToLong(term) // elif t == float: // convert = lambda term: NumericUtils.sortableLongToDouble(NumericUtils.prefixCodedToLong(term)) SearcherAndTaxonomy reference = data.getManager().acquire(); try {/*from w w w .j a v a2 s . co m*/ List<TermCount> terms = new ArrayList<TermCount>(); IndexReader reader = reference.searcher.getIndexReader(); Terms termsEnum = MultiFields.getTerms(reader, field); if (termsEnum == null) return terms; TermsEnum iterator = termsEnum.iterator(null); if (prefix != null) { iterator.seekCeil(new BytesRef(prefix)); terms.add(new TermCount(iterator.term().utf8ToString(), iterator.docFreq())); } while (terms.size() < limit) { BytesRef next = iterator.next(); if (next == null) break; String term = next.utf8ToString(); if (prefix != null && !term.startsWith(prefix)) { break; } terms.add(new TermCount(term, iterator.docFreq())); } return terms; } finally { data.getManager().release(reference); } }
From source file:org.meresco.lucene.Lucene.java
License:Open Source License
/**
 * Searches for documents that share terms with the document identified by
 * {@code identifier}.
 *
 * <p>All terms from the document's term vectors are combined into one
 * {@code CommonTermsQuery}; the document itself is excluded from the result. A response
 * constructed with 0 is returned when the identifier matches no document or the document
 * has no term vectors.
 *
 * @param identifier value of the {@code ID_FIELD} of the reference document
 * @return the response of executing the similarity query
 * @throws Throwable if acquiring the searcher, reading the index or executing the query fails
 */
public LuceneResponse similarDocuments(String identifier) throws Throwable {
    SearcherAndTaxonomy reference = data.getManager().acquire();
    try {
        Query selfQuery = new TermQuery(new Term(ID_FIELD, identifier));
        TopDocs selfHits = reference.searcher.search(selfQuery, 1);
        if (selfHits.totalHits == 0) {
            return new LuceneResponse(0);
        }

        int selfDocId = selfHits.scoreDocs[0].doc;
        IndexReader indexReader = reference.searcher.getIndexReader();
        CommonTermsQuery sharedTerms = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD, 0.1f);
        Fields vectors = indexReader.getTermVectors(selfDocId);
        if (vectors == null) {
            return new LuceneResponse(0);
        }

        // Collect every term of every vectorized field of the reference document.
        for (String fieldName : vectors) {
            TermsEnum fieldTerms = vectors.terms(fieldName).iterator(null);
            for (BytesRef bytes = fieldTerms.next(); bytes != null; bytes = fieldTerms.next()) {
                sharedTerms.add(new Term(fieldName, bytes.utf8ToString()));
            }
        }

        BooleanQuery similar = new BooleanQuery();
        similar.add(selfQuery, Occur.MUST_NOT);
        similar.add(sharedTerms, Occur.MUST);
        return executeQuery(similar);
    } finally {
        data.getManager().release(reference);
    }
}
From source file:org.meresco.lucene.search.MerescoClusterer.java
License:Open Source License
/**
 * Builds a cluster from the given vectors by running five PageRank iterations over them.
 *
 * <p>The cluster holds the ranked documents and the ranked terms, each paired with its
 * PageRank value. Term text is looked up in {@code ords} by node id.
 *
 * @param vectors the document vectors that make up the cluster
 * @return the cluster with its top documents and top terms
 */
private MerescoCluster rankCluster(List<MerescoVector> vectors) {
    PageRank ranker = new PageRank(this.ords.size());
    for (MerescoVector vector : vectors) {
        ranker.add(vector.docId, vector.getPoint());
    }
    ranker.prepare();
    int rounds = 0;
    while (rounds < 5) {
        ranker.iterate();
        rounds++;
    }

    MerescoCluster.DocScore[] docScores = new MerescoCluster.DocScore[vectors.size()];
    int docSlot = 0;
    for (PageRank.Node docNode : ranker.topDocs()) {
        docScores[docSlot] = new MerescoCluster.DocScore(docNode.id, docNode.getPR());
        docSlot++;
    }

    List<Node> rankedTerms = ranker.topTerms();
    MerescoCluster.TermScore[] termScores = new MerescoCluster.TermScore[rankedTerms.size()];
    int termSlot = 0;
    for (PageRank.Node termNode : rankedTerms) {
        BytesRef termBytes = new BytesRef();
        this.ords.get(termNode.id, termBytes);
        termScores[termSlot] = new MerescoCluster.TermScore(termBytes.utf8ToString(), termNode.getPR());
        termSlot++;
    }
    return new MerescoCluster(docScores, termScores);
}
From source file:org.meresco.lucene.search.MerescoVector.java
License:Open Source License
public void printVector(BytesRefHash hash) { Iterator iter = entries.iterator(); while (iter.hasNext()) { iter.advance();//from w ww.j a v a 2s. c om if (iter.value() > 0) { BytesRef b = new BytesRef(); hash.get(iter.key(), b); System.out.print(b.utf8ToString() + ":" + iter.value() + " "); } } System.out.println(); }