Usage examples for org.apache.lucene.util.BytesRef#utf8ToString()

public String utf8ToString()

Interprets the bytes referenced by this BytesRef as UTF-8 and returns the decoded String.
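Before the full examples, a minimal round-trip sketch (class name is illustrative): a BytesRef built from a CharSequence stores the UTF-8 encoding of the text, and utf8ToString() decodes those bytes back into a String.

import org.apache.lucene.util.BytesRef;

public class BytesRefDemo {
    public static void main(String[] args) {
        // BytesRef(CharSequence) stores the UTF-8 encoding of the given text
        BytesRef ref = new BytesRef("Müller");
        // length counts bytes, not characters: 'ü' takes two bytes in UTF-8
        System.out.println(ref.length);          // 7
        // utf8ToString() decodes the referenced bytes back into a String
        System.out.println(ref.utf8ToString());  // Müller
    }
}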
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramCPFE.java
License: Apache License

private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    // add conditions here, like ngram1 is in most freq ngrams1...
                    String combo1 = term.split(ComboUtils.JOINT)[0];
                    String combo2 = term.split(ComboUtils.JOINT)[1];
                    int combinedSize = combo1.split("_").length + combo2.split("_").length;
                    if (topKSetView1.contains(combo1) && topKSet.contains(combo1)
                            && topKSetView2.contains(combo2) && topKSet.contains(combo2)
                            && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                        // print out here for testing
                        topN.add(new TermFreqTuple(term, freq));
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramPFE.java
License: Apache License

private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}
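The examples on this page target the Lucene 4.x API (MultiFields.getFields(...), terms.iterator(null)). On recent Lucene releases the same enumerate-and-decode loop looks slightly different; the following is a sketch against the Lucene 8+ API (MultiTerms.getTerms, no-argument Terms.iterator()), where the index path and field name are placeholders, not values taken from the examples above.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermDumper {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" and "content" are placeholders
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            Terms terms = MultiTerms.getTerms(reader, "content");
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(); // no seed enum since Lucene 5
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    // utf8ToString() materializes the term bytes as a Java String
                    System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                }
            }
        }
    }
}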
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java
License: Apache License

@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class, TestPairReader.PARAM_INPUT_FILE,
            "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    // if there were multiple instances of the same ngram,
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(65, i);
}
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java
License: Apache License

@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class, TestPairReader.PARAM_INPUT_FILE,
            "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramPMetaCollector.class,
                    LuceneNGramPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(16, i);
}
From source file: de.unihildesheim.iw.cli.DumpCommonTerms.java
License: Open Source License

/**
 * Class setup.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 */
@SuppressWarnings("ObjectAllocationInLoop")
@SuppressFBWarnings("SEC_SIDE_EFFECT_CONSTRUCTOR")
private void runMain(final String... args) throws IOException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    LOG.info("Writing terms to '{}'.", this.cliParams.targetFile);

    assert this.cliParams.lang != null;
    assert this.cliParams.stopFilePattern != null;
    final Set<String> sWords = CliCommon.getStopwords(this.cliParams.lang,
            this.cliParams.stopFileFormat, this.cliParams.stopFilePattern);

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
    TermsEnum termsEnum = TermsEnum.EMPTY;
    BytesRef term;
    if (terms != null) {
        termsEnum = terms.iterator(termsEnum);
        term = termsEnum.next();

        assert this.cliParams.targetFile != null;
        try (CSVWriter csvWriter = new CSVWriter(new OutputStreamWriter(
                new FileOutputStream(this.cliParams.targetFile), StandardCharsets.UTF_8))) {
            // write header line
            csvWriter.writeNext(new String[] { "term", "relDF" });

            while (term != null) {
                final String termStr = term.utf8ToString();
                if (!sWords.contains(termStr.toLowerCase())) {
                    final double docFreq = (double) termsEnum.docFreq();
                    if (docFreq > 0d) {
                        final double relDocFreq = docFreq / (double) maxDoc;
                        if (relDocFreq > this.cliParams.threshold) {
                            // log term
                            csvWriter.writeNext(new String[] { termStr,
                                    // make exponential string R compatible
                                    Double.toString(relDocFreq).toLowerCase() });
                        }
                    }
                }
                term = termsEnum.next();
            }
        }
    }
}
From source file: de.unihildesheim.iw.cli.DumpIPCs.java
License: Open Source License

private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;

    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }

        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();

    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();
            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();
                if (rx_ipc == null || (rx_ipc.matcher(code).matches())) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record
                                + " (" + record.toFormattedString() + ") "
                                + '[' + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}
From source file: de.unihildesheim.iw.cli.DumpTermData.java
License: Open Source License

/**
 * Class setup.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 * @throws ClassNotFoundException Thrown if JDBC driver could not be loaded
 * @throws SQLException Thrown, if connection to the database has failed
 * @throws BuildException Thrown, if building a {@link FilteredDirectoryReader} instance has failed
 */
@SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
private void runMain(final String... args)
        throws IOException, SQLException, ClassNotFoundException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    LOG.info("Writing term-data to '{}'.", this.cliParams.dbFile);

    // table manager instance: Target database for term data
    try (final TermDataDB db = new TermDataDB(this.cliParams.dbFile)) {
        // create meta & data table
        final Table termsTable;
        if (this.cliParams.ipcRec == null) {
            termsTable = new TermsTable();
        } else {
            // include optional IPC field
            termsTable = new TermsTable(TermsTable.FieldsOptional.IPC);
        }
        final Table metaTable = new MetaTable();
        db.createTables(termsTable, metaTable);

        try (final TermsTable.Writer dataWriter = new TermsTable.Writer(db.getConnection())) {
            // write meta-data
            try (final MetaTable.Writer metaWriter = new MetaTable.Writer(db.getConnection())) {
                metaWriter.addContent(new TableFieldContent(metaTable)
                        .setValue(MetaTable.Fields.TABLE_NAME, termsTable.getName())
                        .setValue(MetaTable.Fields.CMD, StringUtils.join(args, " ")));
            }

            final Set<String> sWords;
            if (this.cliParams.stopFilePattern != null) {
                sWords = CliCommon.getStopwords(this.cliParams.lang,
                        this.cliParams.stopFileFormat, this.cliParams.stopFilePattern);
            } else {
                sWords = Collections.emptySet();
            }

            final int maxDoc = this.cliParams.idxReader.maxDoc();
            if (maxDoc == 0) {
                LOG.error("Empty index.");
                return;
            }

            final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
            TermsEnum termsEnum = TermsEnum.EMPTY;
            BytesRef term;
            if (terms != null) {
                termsEnum = terms.iterator(termsEnum);
                term = termsEnum.next();

                final AtomicLong count = new AtomicLong(0L);
                @SuppressWarnings("AnonymousInnerClassMayBeStatic")
                final TaskObserver obs = new TaskObserver(new TaskObserverMessage() {
                    @Override
                    public void call(@NotNull final TimeMeasure tm) {
                        LOG.info("Collected {} terms after {}.",
                                NumberFormat.getIntegerInstance().format(count.get()),
                                tm.getTimeString());
                    }
                }).start();

                // normalize some parameters
                final String fieldName = StringUtils.lowerCase(this.cliParams.field);
                final String langName = StringUtils.lowerCase(this.cliParams.lang);

                while (term != null) {
                    final String termStr = term.utf8ToString();
                    if (!sWords.contains(termStr.toLowerCase())) {
                        final double docFreq = (double) termsEnum.docFreq();
                        if (docFreq > 0d) {
                            final double relDocFreq = docFreq / (double) maxDoc;
                            if (relDocFreq > this.cliParams.threshold) {
                                @SuppressWarnings("ObjectAllocationInLoop")
                                final TableFieldContent tfc = new TableFieldContent(termsTable);
                                tfc.setValue(TermsTable.Fields.TERM, termStr);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_REL, relDocFreq);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_ABS, docFreq);
                                tfc.setValue(TermsTable.Fields.LANG, langName);
                                tfc.setValue(TermsTable.Fields.FIELD, fieldName);
                                if (this.cliParams.ipcRec != null) {
                                    tfc.setValue(TermsTable.FieldsOptional.IPC,
                                            this.cliParams.ipcRec.toFormattedString());
                                }
                                dataWriter.addContent(tfc, false);
                                count.incrementAndGet();
                            }
                        }
                    }
                    term = termsEnum.next();
                }
                obs.stop();
                LOG.info("Total of {} terms collected.",
                        NumberFormat.getIntegerInstance().format(count));
            }
        }
    }
}
From source file: de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License: Open Source License

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer();

    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "foo".equals(term.utf8ToString()) || "bar".equals(term.utf8ToString())
                        || "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}
From source file: de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License: Open Source License

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer(csa);

    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}
From source file: de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License: Open Source License

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer();

    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "foo".equals(term.utf8ToString()) || "bar".equals(term.utf8ToString())
                        || "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}