Example usage for org.apache.lucene.index IndexReader docFreq

List of usage examples for org.apache.lucene.index IndexReader docFreq

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader docFreq.

Prototype

public abstract int docFreq(Term term) throws IOException;

Document

Returns the number of documents containing the term.
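
For orientation, here is a minimal self-contained sketch of a direct call; the index path, the field name ("contents"), and the term value are illustrative placeholders, not taken from any example below.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // The index path is a placeholder for this sketch.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Number of documents whose "contents" field contains the term "lucene".
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq(contents:lucene) = " + df);
        }
    }
}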

Usage

From source file:ContentBasedAnalysis.java

License:Apache License

private static int docFreq(IndexReader reader, String s) throws Exception {
    return reader.docFreq(new Term("contents", s));
}

From source file:alix.lucene.MoreLikeThis.java

License:Apache License

/**
 * A CLI test.
 */
public static void main(String args[]) throws Exception {
    String usage = "java com.github.oeuvres.lucene.MoreLikeThis" + " ../lucene-index \n\n"
            + "Parse the files in corpus, with xsl parser, to be indexed in lucene index directory";
    if (args.length < 1) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }
    IndexReader ir = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])));
    MoreLikeThis mlt = new MoreLikeThis(ir);
    mlt.setMinTermFreq(0);
    mlt.setMinDocFreq(0);
    // TODO select Field
    String fieldName = "text";
    mlt.setFieldNames(new String[] { fieldName });

    int maxDoc = ir.docFreq(new Term("type", "chapter"));
    System.out.println("<p>Out of " + maxDoc + "</p>");
    /* 
    Should use a docFreq threshold to filter out grammatical (stop) words.
    On a Zola chapter:
    100%: de, la, il, le, et, l, un, d
    100% - 1: il, et, vous
    99%: on, a, est, vous 
    95%: elles
    90%: chez, a
    50%: noms propres
    */
    mlt.setMaxDocFreq((int) Math.round(maxDoc * 0.50));
    mlt.setLower(true);

    int docnum;
    BufferedReader keyboard = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    while (true) {
        System.out.println("Doc no:");
        String line = keyboard.readLine();
        if (line == null || line.trim().isEmpty())
            System.exit(0);
        docnum = Integer.parseInt(line.trim());
        System.out.println(Arrays.toString(mlt.retrieveInterestingTerms(docnum)));
    }
}
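
Note that docFreq is called here on a structural term (type:chapter) as a proxy for the number of chapter documents, so that setMaxDocFreq can discard terms occurring in more than half of them.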

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.termExtract.LuceneTopTermExtract.java

License:Apache License

private void updateFrequenciesMapsForTerm(Map<String, Double> freqMap, Map<String, Integer> docFreqMap,
        String field, String term, IndexReader reader, double weight) throws IOException {
    if (freqMap.containsKey(term)) {
        freqMap.put(term, freqMap.get(term) + weight);
    } else {
        freqMap.put(term, weight);
    }

    int docFreq = reader.docFreq(new Term(field, term));
    if (docFreqMap.containsKey(term)) {
        docFreqMap.put(term, docFreqMap.get(term) + docFreq);
    } else {
        docFreqMap.put(term, docFreq);
    }
}
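
As a follow-up, a minimal sketch of how the two maps might later be combined into TF-IDF-style scores; the helper name and the use of reader.numDocs() as the collection size are assumptions, not part of the original class.

// Hypothetical helper: combines the accumulated term weights with the
// document frequencies gathered via reader.docFreq(...). Assumes both maps
// were filled by updateFrequenciesMapsForTerm over the same terms.
private static Map<String, Double> tfIdfScores(Map<String, Double> freqMap,
        Map<String, Integer> docFreqMap, IndexReader reader) {
    final Map<String, Double> scores = new HashMap<>();
    int numDocs = Math.max(1, reader.numDocs());
    for (Map.Entry<String, Double> e : freqMap.entrySet()) {
        int df = Math.max(1, docFreqMap.getOrDefault(e.getKey(), 1));
        // Classic idf = log(N / df): terms present in many documents score lower.
        double idf = Math.log((double) numDocs / df);
        scores.put(e.getKey(), e.getValue() * idf);
    }
    return scores;
}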

From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.LuceneTopTermExtract.java

License:Apache License

private void updateFrequenciesMaps(Map<String, Double> freqMap, Map<String, Integer> docFreqMap, String field,
        String content, IndexReader reader, double weight) throws IOException {
    if (content != null && !"".equals(content)) {
        try (TokenStream stream = analyzer.tokenStream(field, content)) {
            stream.reset();

            while (stream.incrementToken()) {
                String term = stream.getAttribute(CharTermAttribute.class).toString();
                term = term.trim();
                if (freqMap.containsKey(term)) {
                    freqMap.put(term, freqMap.get(term) + weight);
                } else {
                    freqMap.put(term, weight);
                }

                int docFreq = reader.docFreq(new Term(field, term));
                if (docFreqMap.containsKey(term)) {
                    docFreqMap.put(term, docFreqMap.get(term) + docFreq);
                } else {
                    docFreqMap.put(term, docFreq);
                }
            }
        }
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) {
    try {
        int[] documentIds = getDocumentIds(d1Index);
        final Map<String, int[]> hashedBlocks = new HashMap<>();
        Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                // check whether it is a common term
                int d2DocFrequency = d2Index.docFreq(new Term(field, text));
                if (d2DocFrequency == 0) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }

                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                hashedBlocks.put(text.utf8ToString(), idsArray);
            }
        }
        return hashedBlocks;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null;
    }
}
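
Here docFreq on the second index serves purely as an existence check: a term whose docFreq in d2Index is zero cannot contribute to a bilateral block, so its postings in d1Index are never enumerated.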

From source file:BlockBuilding.AbstractExtendedSortedNeighborhoodBlocking.java

License:Open Source License

@Override
protected void parseIndices() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);

    final Set<String> blockingKeysSet = getTerms(d1Reader);
    blockingKeysSet.addAll(getTerms(d2Reader));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    //slide window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
    int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds1 = new HashSet<>();
        final Set<Integer> entityIds2 = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            try {
                int docFrequency = d1Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds1.addAll(getTermEntities(documentIdsD1, d1Reader, sortedTerms[i + j]));
                }

                docFrequency = d2Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds2.addAll(getTermEntities(documentIdsD2, d2Reader, sortedTerms[i + j]));
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }

        if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
            int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
            BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
            blocks.add(bBlock);
        }
    }

    noOfEntities = new double[2];
    noOfEntities[0] = d1Reader.numDocs();
    noOfEntities[1] = d2Reader.numDocs();

    Utilities.closeReader(d1Reader);
    Utilities.closeReader(d2Reader);
}

From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java

License:Open Source License

protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) {
    try {
        Term term = new Term(VALUE_LABEL, blockingKey);
        List<Integer> entityIds = new ArrayList<>();
        int docFrequency = iReader.docFreq(term);
        if (0 < docFrequency) {
            BytesRef text = term.bytes();
            DocsEnum de = MultiFields.getTermDocsEnum(iReader, MultiFields.getLiveDocs(iReader), VALUE_LABEL,
                    text);
            int doc;
            while ((doc = de.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                entityIds.add(docIds[doc]);
            }
        }

        return entityIds;
    } catch (IOException ex) {
        ex.printStackTrace();
        return null;
    }
}
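
Note that this variant still uses the DocsEnum API of Lucene 4.x; DocsEnum was replaced by PostingsEnum in Lucene 5.0, as used in the SortedNeighborhoodBlocking example below.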

From source file:BlockBuilding.ExtendedSortedNeighborhoodBlocking.java

License:Apache License

@Override
protected void parseIndices(IndexReader iReader1, IndexReader iReader2) {
    final Set<String> blockingKeysSet = getTerms(iReader1);
    blockingKeysSet.addAll(getTerms(iReader2));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    //slide window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIdsD1 = getDocumentIds(iReader1);
    int[] documentIdsD2 = getDocumentIds(iReader2);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds1 = new HashSet<>();
        final Set<Integer> entityIds2 = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            try {
                int docFrequency = iReader1.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds1.addAll(getTermEntities(documentIdsD1, iReader1, sortedTerms[i + j]));
                }

                docFrequency = iReader2.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds2.addAll(getTermEntities(documentIdsD2, iReader2, sortedTerms[i + j]));
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
        }

        if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
            int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
            BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
            blocks.add(bBlock);
        }
    }
}

From source file:BlockBuilding.MemoryBased.SchemaBased.ExtendedSortedNeighborhood.java

License:Open Source License

@Override
protected void parseIndices() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);

    final Set<String> blockingKeysSet = getTerms(d1Reader);
    blockingKeysSet.addAll(getTerms(d2Reader));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);

    //slide window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
    int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds1 = new HashSet<>();
        final Set<Integer> entityIds2 = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            try {
                int docFrequency = d1Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds1.addAll(getTermEntities(documentIdsD1, d1Reader, sortedTerms[i + j]));
                }

                docFrequency = d2Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds2.addAll(getTermEntities(documentIdsD2, d2Reader, sortedTerms[i + j]));
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }

        if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
            int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
            BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
            blocks.add(bBlock);
        }
    }
}

From source file:BlockBuilding.SortedNeighborhoodBlocking.java

License:Apache License

protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) {
    try {
        Term term = new Term(VALUE_LABEL, blockingKey);
        List<Integer> entityIds = new ArrayList<>();
        int docFrequency = iReader.docFreq(term);
        if (0 < docFrequency) {
            BytesRef text = term.bytes();
            PostingsEnum pe = MultiFields.getTermDocsEnum(iReader, VALUE_LABEL, text);
            int doc;
            while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                entityIds.add(docIds[doc]);
            }
        }

        return entityIds;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null;
    }
}