List of usage examples for org.apache.lucene.index IndexReader docFreq
public abstract int docFreq(Term term) throws IOException;
Parameter: term — the term whose document frequency is requested. Returns the number of documents containing the term, or 0 if the term or field does not exist in the index.
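For orientation before the full examples below, a minimal self-contained sketch of the call; the index path /tmp/index and the "contents"/"lucene" field and term are placeholder assumptions, not taken from any example on this page:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws Exception {
        // open a reader over an existing index (hypothetical path)
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            // number of documents whose "contents" field contains the term "lucene";
            // docFreq returns 0 if the term or field does not exist
            int df = reader.docFreq(new Term("contents", "lucene"));
            System.out.println("docFreq = " + df);
        }
    }
}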
From source file:ContentBasedAnalysis.java
License:Apache License
private static int docFreq(IndexReader reader, String s) throws Exception {
    return reader.docFreq(new Term("contents", s));
}
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** A CLI test. */
public static void main(String args[]) throws Exception {
    String usage = "java com.github.oeuvres.lucene.MoreLikeThis" + " ../lucene-index \n\n"
            + "Parse the files in corpus, with xsl parser, to be indexed in lucene index directory";
    if (args.length < 1) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }
    IndexReader ir = DirectoryReader.open(FSDirectory.open(Paths.get(args[0])));
    MoreLikeThis mlt = new MoreLikeThis(ir);
    mlt.setMinTermFreq(0);
    mlt.setMinDocFreq(0);
    // TODO select Field
    String fieldName = "text";
    mlt.setFieldNames(new String[] { fieldName });
    int maxDoc = ir.docFreq(new Term("type", "chapter"));
    System.out.println("<p>Out of " + maxDoc + "</p>");
    /* Should use a docFreq level to filter out grammatical words.
       On a Zola chapter:
       100%: de, la, il, le, et, l, un, d
       100% - 1: il, et, vous
       99%: on, a, est, vous
       95%: elles
       90%: chez, a
       50%: proper nouns */
    mlt.setMaxDocFreq((int) Math.round(maxDoc * 0.50));
    mlt.setLower(true);
    int docnum;
    BufferedReader keyboard = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    while (true) {
        System.out.println("Doc no:");
        String line = keyboard.readLine();
        // readLine() returns null at end of stream, so check before trimming
        if (line == null || line.trim().isEmpty()) {
            System.exit(0);
        }
        docnum = Integer.parseInt(line.trim());
        System.out.println(Arrays.toString(mlt.retrieveInterestingTerms(docnum)));
    }
}
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.termExtract.LuceneTopTermExtract.java
License:Apache License
private void updateFrequenciesMapsForTerm(Map<String, Double> freqMap, Map<String, Integer> docFreqMap,
        String field, String term, IndexReader reader, double weight) throws IOException {
    // accumulate the weighted term frequency
    if (freqMap.containsKey(term)) {
        freqMap.put(term, freqMap.get(term) + weight);
    } else {
        freqMap.put(term, weight);
    }
    // accumulate the document frequency reported by the index
    int docFreq = reader.docFreq(new Term(field, term));
    if (docFreqMap.containsKey(term)) {
        docFreqMap.put(term, docFreqMap.get(term) + docFreq);
    } else {
        docFreqMap.put(term, docFreq);
    }
}
From source file:be.ugent.tiwi.sleroux.newsrec.recommendationstester.LuceneTopTermExtract.java
License:Apache License
private void updateFrequenciesMaps(Map<String, Double> freqMap, Map<String, Integer> docFreqMap, String field,
        String content, IndexReader reader, double weight) throws IOException {
    if (content != null && !"".equals(content)) {
        try (TokenStream stream = analyzer.tokenStream(field, content)) {
            stream.reset();
            while (stream.incrementToken()) {
                String term = stream.getAttribute(CharTermAttribute.class).toString().trim();
                if (freqMap.containsKey(term)) {
                    freqMap.put(term, freqMap.get(term) + weight);
                } else {
                    freqMap.put(term, weight);
                }
                int docFreq = reader.docFreq(new Term(field, term));
                if (docFreqMap.containsKey(term)) {
                    docFreqMap.put(term, docFreqMap.get(term) + docFreq);
                } else {
                    docFreqMap.put(term, docFreq);
                }
            }
            stream.end();
        }
    }
}
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) {
    try {
        int[] documentIds = getDocumentIds(d1Index);
        final Map<String, int[]> hashedBlocks = new HashMap<>();
        Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                // check whether it is a common term
                int d2DocFrequency = d2Index.docFreq(new Term(field, text));
                if (d2DocFrequency == 0) {
                    continue;
                }
                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }
                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                hashedBlocks.put(text.utf8ToString(), idsArray);
            }
        }
        return hashedBlocks;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null;
    }
}
From source file:BlockBuilding.AbstractExtendedSortedNeighborhoodBlocking.java
License:Open Source License
@Override
protected void parseIndices() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);
    final Set<String> blockingKeysSet = getTerms(d1Reader);
    blockingKeysSet.addAll(getTerms(d2Reader));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);
    // slide a window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
    int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds1 = new HashSet<>();
        final Set<Integer> entityIds2 = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            try {
                int docFrequency = d1Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds1.addAll(getTermEntities(documentIdsD1, d1Reader, sortedTerms[i + j]));
                }
                docFrequency = d2Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds2.addAll(getTermEntities(documentIdsD2, d2Reader, sortedTerms[i + j]));
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
            int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
            BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
            blocks.add(bBlock);
        }
    }
    noOfEntities = new double[2];
    noOfEntities[0] = d1Reader.numDocs();
    noOfEntities[1] = d2Reader.numDocs();
    Utilities.closeReader(d1Reader);
    Utilities.closeReader(d2Reader);
}
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) {
    try {
        Term term = new Term(VALUE_LABEL, blockingKey);
        List<Integer> entityIds = new ArrayList<>();
        int docFrequency = iReader.docFreq(term);
        if (0 < docFrequency) {
            BytesRef text = term.bytes();
            DocsEnum de = MultiFields.getTermDocsEnum(iReader, MultiFields.getLiveDocs(iReader), VALUE_LABEL, text);
            int doc;
            while ((doc = de.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                entityIds.add(docIds[doc]);
            }
        }
        return entityIds;
    } catch (IOException ex) {
        ex.printStackTrace();
        return null;
    }
}
From source file:BlockBuilding.ExtendedSortedNeighborhoodBlocking.java
License:Apache License
@Override
protected void parseIndices(IndexReader iReader1, IndexReader iReader2) {
    final Set<String> blockingKeysSet = getTerms(iReader1);
    blockingKeysSet.addAll(getTerms(iReader2));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);
    // slide a window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIdsD1 = getDocumentIds(iReader1);
    int[] documentIdsD2 = getDocumentIds(iReader2);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds1 = new HashSet<>();
        final Set<Integer> entityIds2 = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            try {
                int docFrequency = iReader1.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds1.addAll(getTermEntities(documentIdsD1, iReader1, sortedTerms[i + j]));
                }
                docFrequency = iReader2.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds2.addAll(getTermEntities(documentIdsD2, iReader2, sortedTerms[i + j]));
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
        }
        if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
            int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
            BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
            blocks.add(bBlock);
        }
    }
}
From source file:BlockBuilding.MemoryBased.SchemaBased.ExtendedSortedNeighborhood.java
License:Open Source License
@Override
protected void parseIndices() {
    IndexReader d1Reader = Utilities.openReader(indexDirectory[0]);
    IndexReader d2Reader = Utilities.openReader(indexDirectory[1]);
    final Set<String> blockingKeysSet = getTerms(d1Reader);
    blockingKeysSet.addAll(getTerms(d2Reader));
    String[] sortedTerms = blockingKeysSet.toArray(new String[blockingKeysSet.size()]);
    Arrays.sort(sortedTerms);
    // slide a window over the sorted list of blocking keys
    int upperLimit = sortedTerms.length - windowSize;
    int[] documentIdsD1 = Utilities.getDocumentIds(d1Reader);
    int[] documentIdsD2 = Utilities.getDocumentIds(d2Reader);
    for (int i = 0; i <= upperLimit; i++) {
        final Set<Integer> entityIds1 = new HashSet<>();
        final Set<Integer> entityIds2 = new HashSet<>();
        for (int j = 0; j < windowSize; j++) {
            try {
                int docFrequency = d1Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds1.addAll(getTermEntities(documentIdsD1, d1Reader, sortedTerms[i + j]));
                }
                docFrequency = d2Reader.docFreq(new Term(VALUE_LABEL, sortedTerms[i + j]));
                if (0 < docFrequency) {
                    entityIds2.addAll(getTermEntities(documentIdsD2, d2Reader, sortedTerms[i + j]));
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        if (!entityIds1.isEmpty() && !entityIds2.isEmpty()) {
            int[] idsArray1 = Converter.convertCollectionToArray(entityIds1);
            int[] idsArray2 = Converter.convertCollectionToArray(entityIds2);
            BilateralBlock bBlock = new BilateralBlock(idsArray1, idsArray2);
            blocks.add(bBlock);
        }
    }
}
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
protected List<Integer> getTermEntities(int[] docIds, IndexReader iReader, String blockingKey) {
    try {
        Term term = new Term(VALUE_LABEL, blockingKey);
        List<Integer> entityIds = new ArrayList<>();
        int docFrequency = iReader.docFreq(term);
        if (0 < docFrequency) {
            BytesRef text = term.bytes();
            PostingsEnum pe = MultiFields.getTermDocsEnum(iReader, VALUE_LABEL, text);
            int doc;
            while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                entityIds.add(docIds[doc]);
            }
        }
        return entityIds;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null;
    }
}