Usage examples for the org.apache.lucene.index.Fields#terms(String) method
public abstract Terms terms(String field) throws IOException;
From source file:VocabDumper.java
/**
 * Prints every indexed term of the configured English/Hindi fields to stdout,
 * skipping terms that contain '#'.
 *
 * @throws IOException if the index directory cannot be opened or read
 */
void printWords() throws IOException {
    // try-with-resources: the original leaked the reader (never closed).
    try (IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir))) {
        Fields fields = MultiFields.getFields(indexReader);
        String[] fieldNames = { MSIRDoc.FIELD_TITLE_EN, MSIRDoc.FIELD_EN,
                MSIRDoc.FIELD_TITLE_HN, MSIRDoc.FIELD_HN };
        for (String fieldName : fieldNames) {
            Terms terms = fields.terms(fieldName);
            if (terms == null) {
                continue; // field absent from this index — original would NPE here
            }
            TermsEnum iterator = terms.iterator(null);
            BytesRef byteRef;
            while ((byteRef = iterator.next()) != null) {
                // utf8ToString() decodes the term bytes as UTF-8 explicitly;
                // new String(bytes, off, len) used the platform default charset.
                String term = byteRef.utf8ToString();
                if (term.indexOf('#') == -1) {
                    System.out.println(term);
                }
            }
        }
    }
}
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) { try {// ww w. ja va 2s.c o m int[] documentIds = getDocumentIds(d1Index); final Map<String, int[]> hashedBlocks = new HashMap<>(); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { // check whether it is a common term int d2DocFrequency = d2Index.docFreq(new Term(field, text)); if (d2DocFrequency == 0) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); hashedBlocks.put(text.utf8ToString(), idsArray); } } return hashedBlocks; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) { try {//from ww w . ja va2 s .c o m int[] documentIds = getDocumentIds(d2Index); Fields fields = MultiFields.getFields(d2Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (!hashedBlocks.containsKey(text.utf8ToString())) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); int[] d1Entities = hashedBlocks.get(text.utf8ToString()); blocks.add(new BilateralBlock(d1Entities, idsArray)); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseIndex(IndexReader d1Index) { try {//from ww w . j av a 2 s. c o m int[] documentIds = getDocumentIds(d1Index); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (termsEnum.docFreq() < 2) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); UnilateralBlock block = new UnilateralBlock(idsArray); blocks.add(block); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
protected Set<String> getTerms(IndexReader iReader) { Set<String> sortedTerms = new HashSet<>(); try {// w ww . j a va2s. c om Fields fields = MultiFields.getFields(iReader); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(null); BytesRef text; while ((text = termsEnum.next()) != null) { sortedTerms.add(text.utf8ToString()); } } } catch (IOException ex) { ex.printStackTrace(); } return sortedTerms; }
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
protected Set<String> getTerms(IndexReader iReader) { Set<String> sortedTerms = new HashSet<>(); try {//from w ww.ja va 2 s . co m Fields fields = MultiFields.getFields(iReader); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { sortedTerms.add(text.utf8ToString()); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } return sortedTerms; }
From source file:com.core.nlp.index.IndexReader.java
License:Apache License
/**
 * Retrieve term vector for this document and field, or null if term vectors
 * were not indexed. The returned Fields instance acts like a single-document
 * inverted index (the docID will be 0).
 */
public final Terms getTermVector(int docID, String field) throws IOException {
    Fields vectors = getTermVectors(docID);
    return (vectors == null) ? null : vectors.terms(field);
}
From source file:com.core.nlp.query.MoreLikeThis.java
License:Apache License
/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms *//*from w w w.j a v a2 s.c o m*/ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField[] fields = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector); } } return createQueue(termFreqMap); }
From source file:com.factweavers.elasticsearch.payloadscorefunction.PayloadScoringFunction.java
License:Apache License
@Override public double score(int docId, float subQueryScore) { indexLookup.setNextDocId(docId);//from ww w . ja v a 2s. c o m float score = 0; int obtainedTerms = 0; try { Fields termVectors = indexLookup.termVectors(); Boolean isPayloadOrIndex = false; TermsEnum iterator = null; if (termVectors != null && termVectors.terms(field) != null && termVectors.terms(field).hasPayloads()) { isPayloadOrIndex = true; Terms fields = termVectors.terms(field); iterator = fields.iterator(null); } if (isPayloadOrIndex) { BytesRef firstElement = iterator.next(); while (firstElement != null && (obtainedTerms < values.size())) { String currentValue = firstElement.utf8ToString(); if (!values.contains(currentValue)) { //logger.info("Payload Skipping " + currentValue); firstElement = iterator.next(); continue; } else { obtainedTerms++; } //logger.info("Payload processing value is " + currentValue); DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); docsAndPositions.nextDoc(); docsAndPositions.nextPosition(); BytesRef payload = docsAndPositions.getPayload(); if (payload != null) { score += PayloadHelper.decodeFloat(payload.bytes, payload.offset); //logger.info("Score " + score); } else { score += defaultValue; } firstElement = iterator.next(); } } else { IndexField fieldObject = indexLookup.get(field); for (String value : values) { IndexFieldTerm tokens = fieldObject.get(value, IndexLookup.FLAG_CACHE | IndexLookup.FLAG_PAYLOADS); if (fieldObject != null && tokens != null) { //logger.info("Processing docID=" + docId + " " + field // + " for " + value + " , " + tokens); if (tokens.iterator().hasNext()) { score += tokens.iterator().next().payloadAsFloat(defaultValue); } } } } } catch (IOException e) { //logger.info("Exception in Term Vectors"); e.printStackTrace(); } return new Double(score); }
From source file:com.github.flaxsearch.resources.TermsResource.java
License:Apache License
/**
 * Returns up to {@code count} terms of {@code field}, optionally starting at
 * {@code startTerm} (seekCeil) and filtered by {@code filter}. Terms are
 * encoded with the requested encoding.
 *
 * @throws WebApplicationException 404 if the field does not exist, 400 if a
 *         term cannot be decoded with the given encoding
 * @throws IOException on index read failure
 */
@GET
public TermsData getTerms(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @QueryParam("from") String startTerm, @QueryParam("filter") String filter,
        @QueryParam("encoding") @DefaultValue("utf8") String encoding,
        @QueryParam("count") @DefaultValue("50") int count) throws IOException {
    try {
        Fields fields = readerManager.getFields(segment);
        Terms terms = fields.terms(field);
        if (terms == null) {
            throw new WebApplicationException("No such field " + field, Response.Status.NOT_FOUND);
        }
        TermsEnum te = getTermsEnum(terms, filter);
        List<String> collected = new ArrayList<>();
        // Position the enum: at the first term >= startTerm, or at the first term.
        if (startTerm != null) {
            BytesRef start = BytesRefUtils.decode(startTerm, encoding);
            if (te.seekCeil(start) == TermsEnum.SeekStatus.END) {
                return new TermsData(terms, Collections.emptyList(), encoding);
            }
        } else if (te.next() == null) {
            return new TermsData(terms, Collections.emptyList(), encoding);
        }
        // Guard fixes an off-by-one: the original do-while always emitted one
        // term even when the client asked for count <= 0.
        if (count > 0) {
            do {
                collected.add(BytesRefUtils.encode(te.term(), encoding));
            } while (te.next() != null && --count > 0);
        }
        return new TermsData(terms, collected, encoding);
    } catch (NumberFormatException e) {
        throw new WebApplicationException("Field " + field + " cannot be decoded as " + encoding,
                Response.Status.BAD_REQUEST);
    }
}