Example usage for org.apache.lucene.util BytesRef utf8ToString

List of usage examples for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usages of org.apache.lucene.util BytesRef utf8ToString.

Prototype

public String utf8ToString() 

Document

Interprets the stored bytes as UTF-8 bytes and returns the resulting String.
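
Before the full examples below, here is a minimal standalone sketch of the round trip from a String to UTF-8 bytes and back; the class name BytesRefExample is chosen only for illustration:

import org.apache.lucene.util.BytesRef;

public class BytesRefExample {
    public static void main(String[] args) {
        // new BytesRef(CharSequence) stores the UTF-8 encoding of the given text.
        BytesRef ref = new BytesRef("caffeine");
        // utf8ToString() interprets the stored bytes as UTF-8 and rebuilds the String.
        String decoded = ref.utf8ToString();
        System.out.println(decoded); // prints "caffeine"
    }
}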

Usage

From source file:com.twentyn.patentSearch.DocumentSearch.java

License:Open Source License

public static void main(String[] args) throws Exception {
    System.out.println("Starting up...");
    System.out.flush();
    Options opts = new Options();
    opts.addOption(Option.builder("x").longOpt("index").hasArg().required().desc("Path to index file to read")
            .build());
    opts.addOption(Option.builder("h").longOpt("help").desc("Print this help message and exit").build());
    opts.addOption(Option.builder("v").longOpt("verbose").desc("Print verbose log output").build());

    opts.addOption(Option.builder("f").longOpt("field").hasArg().desc("The indexed field to search").build());
    opts.addOption(
            Option.builder("q").longOpt("query").hasArg().desc("The query to use when searching").build());
    opts.addOption(Option.builder("l").longOpt("list-file").hasArg()
            .desc("A file containing a list of queries to run in sequence").build());
    opts.addOption(
            Option.builder("e").longOpt("enumerate").desc("Enumerate the documents in the index").build());
    opts.addOption(Option.builder("d").longOpt("dump").hasArg()
            .desc("Dump terms in the document index for a specified field").build());
    opts.addOption(
            Option.builder("o").longOpt("output").hasArg().desc("Write results JSON to this file.").build());
    opts.addOption(Option.builder("n").longOpt("inchi-field").hasArg()
            .desc("The index of the InChI field if an input TSV is specified.").build());
    opts.addOption(Option.builder("s").longOpt("synonym-field").hasArg()
            .desc("The index of the chemical synonym field if an input TSV is specified.").build());

    HelpFormatter helpFormatter = new HelpFormatter();
    CommandLineParser cmdLineParser = new DefaultParser();
    CommandLine cmdLine = null;
    try {
        cmdLine = cmdLineParser.parse(opts, args);
    } catch (ParseException e) {
        System.out.println("Caught exception when parsing command line: " + e.getMessage());
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("help")) {
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(0);
    }

    if (!(cmdLine.hasOption("enumerate") || cmdLine.hasOption("dump") || (cmdLine.hasOption("field")
            && (cmdLine.hasOption("query") || cmdLine.hasOption("list-file"))))) {
        System.out.println("Must specify one of 'enumerate', 'dump', or 'field' + {'query', 'list-file'}");
        helpFormatter.printHelp("DocumentIndexer", opts);
        System.exit(1);
    }

    if (cmdLine.hasOption("verbose")) {
        // With help from http://stackoverflow.com/questions/23434252/programmatically-change-log-level-in-log4j2
        LoggerContext ctx = (LoggerContext) LogManager.getContext(false);
        Configuration ctxConfig = ctx.getConfiguration();
        LoggerConfig logConfig = ctxConfig.getLoggerConfig(LogManager.ROOT_LOGGER_NAME);
        logConfig.setLevel(Level.DEBUG);

        ctx.updateLoggers();
        LOGGER.debug("Verbose logging enabled");
    }

    ObjectMapper objectMapper = new ObjectMapper();
    objectMapper.enable(SerializationFeature.INDENT_OUTPUT);
    objectMapper.setVisibility(PropertyAccessor.ALL, JsonAutoDetect.Visibility.ANY);

    LOGGER.info("Opening index at " + cmdLine.getOptionValue("index"));

    try (Directory indexDir = FSDirectory.open(new File(cmdLine.getOptionValue("index")).toPath());
            IndexReader indexReader = DirectoryReader.open(indexDir);) {
        if (cmdLine.hasOption("enumerate")) {
            /* Enumerate all documents in the index.
             * With help from
             * http://stackoverflow.com/questions/2311845/is-it-possible-to-iterate-through-documents-stored-in-lucene-index
             */
            for (int i = 0; i < indexReader.maxDoc(); i++) {
                Document doc = indexReader.document(i);
                LOGGER.info("Doc " + i + ":");
                LOGGER.info(doc);
            }
        } else if (cmdLine.hasOption("dump")) {
            /* Dump indexed terms for a specific field.
             * With help from http://stackoverflow.com/questions/11148036/find-list-of-terms-indexed-by-lucene */
            Terms terms = SlowCompositeReaderWrapper.wrap(indexReader).terms(cmdLine.getOptionValue("dump"));
            LOGGER.info("Has positions: " + terms.hasPositions());
            LOGGER.info("Has offsets:   " + terms.hasOffsets());
            LOGGER.info("Has freqs:     " + terms.hasFreqs());
            LOGGER.info("Stats:         " + terms.getStats());
            LOGGER.info(terms);
            TermsEnum termsEnum = terms.iterator();
            BytesRef br = null;
            while ((br = termsEnum.next()) != null) {
                LOGGER.info("  " + br.utf8ToString());
            }

        } else {
            IndexSearcher searcher = new IndexSearcher(indexReader);
            String field = cmdLine.getOptionValue("field");

            List<Pair<String, String>> queries = null;
            if (cmdLine.hasOption("query")) {
                queries = Collections.singletonList(Pair.of("", cmdLine.getOptionValue("query")));
            } else if (cmdLine.hasOption("list-file")) {
                if (!(cmdLine.hasOption("inchi-field") && cmdLine.hasOption("synonym-field"))) {
                    LOGGER.error("Must specify both inchi-field and synonym-field when using list-file.");
                    System.exit(1);
                }
                Integer inchiField = Integer.parseInt(cmdLine.getOptionValue("inchi-field"));
                Integer synonymField = Integer.parseInt(cmdLine.getOptionValue("synonym-field"));

                queries = new LinkedList<>();
                BufferedReader r = new BufferedReader(new FileReader(cmdLine.getOptionValue("list-file")));
                String line;
                while ((line = r.readLine()) != null) {
                    line = line.trim();
                    if (!line.isEmpty()) {
                        // TODO: use a proper TSV reader; this is intentionally terrible as is.
                        String[] fields = line.split("\t");
                        queries.add(Pair.of(fields[inchiField].replace("\"", ""), fields[synonymField]));
                    }
                }
                r.close();
            }

            if (queries == null || queries.size() == 0) {
                LOGGER.error("Found no queries to run.");
                return;
            }

            List<SearchResult> searchResults = new ArrayList<>(queries.size());
            for (Pair<String, String> queryPair : queries) {
                String inchi = queryPair.getLeft();
                String rawQueryString = queryPair.getRight();
                /* The Lucene query parser interprets the kind of structural annotations we see in chemical entities
                 * as query directives, which is not what we want at all.  Phrase queries seem to work adequately
                 * with the analyzer we're currently using. */
                String queryString = rawQueryString.trim().toLowerCase();
                String[] parts = queryString.split("\\s+");
                PhraseQuery query = new PhraseQuery();
                for (String p : parts) {
                    query.add(new Term(field, p));
                }
                LOGGER.info("Running query: " + query.toString());

                BooleanQuery bq = new BooleanQuery();
                bq.add(query, BooleanClause.Occur.MUST);
                bq.add(new TermQuery(new Term(field, "yeast")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "ferment")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentation")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "fermentive")), BooleanClause.Occur.SHOULD);
                bq.add(new TermQuery(new Term(field, "saccharomyces")), BooleanClause.Occur.SHOULD);

                LOGGER.info("  Full query: " + bq.toString());

                TopDocs topDocs = searcher.search(bq, 100);
                ScoreDoc[] scoreDocs = topDocs.scoreDocs;
                if (scoreDocs.length == 0) {
                    LOGGER.info("Search returned no results.");
                }
                List<ResultDocument> results = new ArrayList<>(scoreDocs.length);
                for (int i = 0; i < scoreDocs.length; i++) {
                    ScoreDoc scoreDoc = scoreDocs[i];
                    Document doc = indexReader.document(scoreDoc.doc);
                    LOGGER.info("Doc " + i + ": " + scoreDoc.doc + ", score " + scoreDoc.score + ": "
                            + doc.get("id") + ", " + doc.get("title"));
                    results.add(new ResultDocument(scoreDoc.doc, scoreDoc.score, doc.get("title"),
                            doc.get("id"), null));
                }
                LOGGER.info("----- Done with query " + query.toString());
                // TODO: reduce memory usage when not writing results to an output file.
                searchResults.add(new SearchResult(inchi, rawQueryString, bq, results));
            }

            if (cmdLine.hasOption("output")) {
                try (FileWriter writer = new FileWriter(cmdLine.getOptionValue("output"));) {
                    writer.write(objectMapper.writeValueAsString(searchResults));
                }
            }
        }
    }
}

From source file:CopulaResources.TermCooccurence.java

public static TermCooccurence generateCooccurencebyClass(IndexReader indexReader, String classFieldName,
        String textFieldName, Analyzer analyzer, int minFreq, int maxFreq) throws IOException {
    System.out.println(":::Generating Term-Pair List:::");
    TermCooccurence CooccurList = new TermCooccurence();
    Terms classes = MultiFields.getTerms(indexReader, classFieldName);
    if (classes != null) {
        TermsEnum classesEnum = classes.iterator();
        BytesRef nextClass;
        while ((nextClass = classesEnum.next()) != null) {
            if (nextClass.length > 0) {

                Term term = new Term(classFieldName, nextClass);
                String tpClass = nextClass.utf8ToString();
                BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
                booleanQuery.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.MUST));
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                TopDocs topDocs = indexSearcher.search(booleanQuery.build(), indexReader.numDocs());

                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                    IndexableField[] storableFields = indexSearcher.doc(scoreDoc.doc).getFields(textFieldName);
                    for (IndexableField singleStorableField : storableFields) {
                        if (singleStorableField != null) {
                            BytesRef text = new BytesRef(singleStorableField.stringValue());
                            generateCooccurences(text.utf8ToString(), analyzer, CooccurList, tpClass);
                        }
                    }
                }
                CooccurList.trimbyFreq(tpClass, minFreq, maxFreq);

            }
        }
    }
    System.out.println(":::Generation Complete:::");
    return CooccurList;
}

From source file:CopulaResources.TermCooccurence.java

public static void generateCooccurencebyClass(IndexReader indexReader, String classFieldName,
        String textFieldName, Analyzer analyzer, int minFreq, int maxFreq, Path saveDir) throws IOException {
    System.out.println(":::Generating Term-Pair List:::");
    TermCooccurence CooccurList = new TermCooccurence();
    Terms classes = MultiFields.getTerms(indexReader, classFieldName);
    if (classes != null) {
        TermsEnum classesEnum = classes.iterator();
        BytesRef nextClass;
        while ((nextClass = classesEnum.next()) != null) {
            if (nextClass.length > 0) {

                Term term = new Term(classFieldName, nextClass);
                String tpClass = nextClass.utf8ToString();
                BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
                booleanQuery.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.MUST));
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                TopDocs topDocs = indexSearcher.search(booleanQuery.build(), indexReader.numDocs());

                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                    IndexableField[] storableFields = indexSearcher.doc(scoreDoc.doc).getFields(textFieldName);
                    for (IndexableField singleStorableField : storableFields) {
                        if (singleStorableField != null) {
                            BytesRef text = new BytesRef(singleStorableField.stringValue());
                            generateCooccurences(text.utf8ToString(), analyzer, CooccurList, tpClass);
                        }
                    }
                }
                CooccurList.trimbyFreq(tpClass, minFreq, maxFreq);

            }
        }
    }
    CooccurList.savetoFile(saveDir);
    System.out.println(":::Generation Complete:::");
}

From source file:CopulaResources.TermCooccurence.java

public static void generateNCooccurencebyClass(IndexReader indexReader, String classFieldName,
        String textFieldName, Analyzer analyzer, String direction, double percent, Path saveDir)
        throws IOException {
    System.out.println(":::Generating Term-Pair List:::");
    TermCooccurence CooccurList = new TermCooccurence();
    Terms classes = MultiFields.getTerms(indexReader, classFieldName);
    if (classes != null) {
        TermsEnum classesEnum = classes.iterator();
        BytesRef nextClass;
        while ((nextClass = classesEnum.next()) != null) {
            if (nextClass.length > 0) {

                Term term = new Term(classFieldName, nextClass);
                String tpClass = nextClass.utf8ToString();
                BooleanQuery.Builder booleanQuery = new BooleanQuery.Builder();
                booleanQuery.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.MUST));
                IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                TopDocs topDocs = indexSearcher.search(booleanQuery.build(), indexReader.numDocs());

                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                    IndexableField[] storableFields = indexSearcher.doc(scoreDoc.doc).getFields(textFieldName);
                    for (IndexableField singleStorableField : storableFields) {
                        if (singleStorableField != null) {
                            BytesRef text = new BytesRef(singleStorableField.stringValue());
                            generateCooccurences(text.utf8ToString(), analyzer, CooccurList, tpClass);
                        }
                    }
                }
                CooccurList.trimbyPercent(tpClass, direction, percent);

            }
        }
    }
    CooccurList.savetoFile(saveDir);
    System.out.println(":::Generation Complete:::");
}

From source file:de.blizzy.documentr.search.PageIndex.java

License:Open Source License

public Set<String> getAllTags(Authentication authentication) throws IOException, TimeoutException {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        searcher = searcherManager.acquire();
        Bits visibleDocs = getVisibleDocIds(searcher, authentication);
        Set<String> tags = Sets.newHashSet();
        if (visibleDocs.length() > 0) {
            reader = searcher.getIndexReader();
            Terms terms = MultiFields.getTerms(reader, TAG);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef ref;
                while ((ref = termsEnum.next()) != null) {
                    DocsEnum docsEnum = termsEnum.docs(visibleDocs, null, 0);
                    if (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        tags.add(ref.utf8ToString());
                    }
                }
            }
        }
        return tags;
    } finally {
        if (searcher != null) {
            searcherManager.release(searcher);
        }
    }
}

From source file:de.blizzy.documentr.search.TagFinder.java

License:Open Source License

public Set<String> getAllTags(Authentication authentication) throws IOException, TimeoutException {
    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        searcher = searcherManager.acquire();

        // no point in running the task asynchronously here
        GetVisibleDocIdsTask visibleDocIdsTask = new GetVisibleDocIdsTask(searcher, authentication, userStore,
                permissionEvaluator, taskExecutor);
        Bits visibleDocIds = visibleDocIdsTask.call();

        Set<String> tags = Sets.newHashSet();
        if (visibleDocIds.length() > 0) {
            reader = searcher.getIndexReader();
            Terms terms = MultiFields.getTerms(reader, PageIndex.TAG);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef ref;
                while ((ref = termsEnum.next()) != null) {
                    DocsEnum docsEnum = termsEnum.docs(visibleDocIds, null, 0);
                    if (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                        tags.add(ref.utf8ToString());
                    }
                }
            }
        }
        return tags;
    } finally {
        if (searcher != null) {
            searcherManager.release(searcher);
        }
    }
}

From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java

License:Open Source License

public List<Suggestion> suggestSearchPhrase(String aFieldName, String aPhrase) throws IOException {

    LOGGER.info("Trying to find suggestions for phrase " + aPhrase);

    long theStartTime = System.currentTimeMillis();
    try {
        List<String> theTokens = toTokens(aFieldName, aPhrase);

        List<SpanQuery> theSpanQueries = theTokens.stream().map(s -> {
            if (QueryUtils.isWildCard(s)) {
                WildcardQuery theWildcardQuery = new WildcardQuery(new Term(aFieldName, s));
                SpanMultiTermQueryWrapper theWrapper = new SpanMultiTermQueryWrapper(theWildcardQuery);
                try {
                    return theWrapper.getRewriteMethod().rewrite(indexReader, theWildcardQuery);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
            return new SpanTermQuery(new Term(aFieldName, s));
        }).collect(Collectors.toList());

        SpanQuery theSpanQuery = new SpanNearQuery(theSpanQueries.toArray(new SpanQuery[theSpanQueries.size()]),
                configuration.getSuggestionSlop(), configuration.isSuggestionInOrder());

        LOGGER.info("created span query " + theSpanQuery);

        LeafReader theAtomicReader = SlowCompositeReaderWrapper.wrap(indexReader);

        Map<Term, TermContext> theTermContexts = new HashMap<>();
        Map<String, Long> theSpanFrequencies = new HashMap<>();

        // These are all the matching spans over all documents
        Spans theMatchingSpans = theSpanQuery.getSpans(theAtomicReader.getContext(),
                new Bits.MatchAllBits(indexReader.numDocs()), theTermContexts);

        while (theMatchingSpans.next()) {

            // This maps the position of a term to the term string itself.
            // The positions must be in order, so we use a TreeMap.
            Map<Integer, String> theEntries = new TreeMap<>();

            Terms theAllTermsFromDocument = indexReader.getTermVector(theMatchingSpans.doc(),
                    IndexFields.CONTENT_NOT_STEMMED);
            int theSpanStart = theMatchingSpans.start() - configuration.getSuggestionWindowBefore();
            int theSpanEnd = theMatchingSpans.end() + configuration.getSuggestionWindowAfter();
            TermsEnum theTermsEnum = theAllTermsFromDocument.iterator(null);
            BytesRef theTerm;
            while ((theTerm = theTermsEnum.next()) != null) {
                DocsAndPositionsEnum thePositionEnum = theTermsEnum.docsAndPositions(null, null);
                if (thePositionEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    int i = 0;
                    int position;
                    while (i < thePositionEnum.freq() && (position = thePositionEnum.nextPosition()) != -1) {
                        if (position >= theSpanStart && position <= theSpanEnd) {
                            theEntries.put(position, theTerm.utf8ToString());
                        }
                        i++;
                    }
                }
            }

            StringBuilder theResultString = new StringBuilder();
            theEntries.entrySet().forEach(e -> {
                if (theResultString.length() > 0) {
                    theResultString.append(" ");
                }
                theResultString.append(e.getValue());
            });

            String theTotalSpan = theResultString.toString().trim();

            Long theFrequency = theSpanFrequencies.get(theTotalSpan);
            if (theFrequency == null) {
                theSpanFrequencies.put(theTotalSpan, 1L);
            } else {
                theSpanFrequencies.put(theTotalSpan, theFrequency + 1);
            }
        }

        return theSpanFrequencies.entrySet().stream().filter(t -> t.getValue() > 1)
                .sorted((o1, o2) -> o2.getValue().compareTo(o1.getValue()))
                .limit(configuration.getNumberOfSuggestions())
                .map(T -> new Suggestion(highlight(T.getKey(), theTokens), T.getKey()))
                .collect(Collectors.toList());
    } finally {
        long theDuration = System.currentTimeMillis() - theStartTime;
        LOGGER.info("Took " + theDuration + "ms");
    }
}

From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java

License:Open Source License

public void vectorize(File luceneIndexDir, File outputDir) throws Exception {

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;

    FeatureDictionary dict = new FeatureDictionary();

    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));

        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {

                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        int numDocsVectorized = 0;

        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);

            int itemID = doc.getField("itemID").numericValue().intValue();

            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {

                String fieldName = field.name();

                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {

                    Terms termFreqVector = reader.getTermVector(docID, fieldName);

                    if (termFreqVector != null) {

                        int maxTermFrequency = maxTermFrequency(termFreqVector);

                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;

                        while ((term = te.next()) != null) {

                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();

                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();

                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);

                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }

                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);

            weighting.normalize(featureVector);

            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);

            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }

        log.info("Vectorized {} documents", numDocsVectorized);

        dict.writeToFile(new File(outputDir, "features.txt"));

        log.info("Wrote feature dictionary");

    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }

}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase.java

License:Apache License

@Override
protected FrequencyDistribution<String> getTopNgrams() throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(getTopN()).create();

    long ngramVocabularySize = 0;
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(getFieldName());
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    if (passesScreening(term)) {
                        topN.add(new TermFreqTuple(term, freq));
                        ngramVocabularySize += freq;
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        long absCount = tuple.getFreq();
        double relFrequency = ((double) absCount) / ngramVocabularySize;

        if (relFrequency >= ngramFreqThreshold) {
            topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
        }
    }

    logSelectionProcess(topNGrams.getB());

    return topNGrams;
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java

License:Apache License

@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
        //            System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                //                    Bits liveDocs = MultiFields.getLiveDocs(index);
                //                    DocsEnum docs = termsEnum.docs(liveDocs, null);
                //                    int docId;
                //                    while((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                //                        index.g
                //                    }
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    //                        System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    //                        System.out.println(termsEnum.docFreq());

                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
}