List of usage examples for org.apache.lucene.index.IndexReader.document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.

From source file: vagueobjects.ir.lda.lucene.Indexer.java
License: Apache License
private void collect(Term term, IndexReader reader, IndexWriter writer, String[] fieldNames) throws IOException {
    // Iterate over the full docID space (maxDoc), not numDocs: when the index
    // contains deletions, numDocs undercounts and the loop would miss trailing docs.
    int maxDoc = reader.maxDoc();
    String field = term.field();
    String value = term.text();
    int count = 0;
    for (int d = 0; d < maxDoc; ++d) {
        // Skip deleted documents before loading their stored fields.
        if (reader.isDeleted(d)) {
            continue;
        }
        Document source = reader.document(d);
        if (value.equals(source.get(field))) {
            ++count;
            if (count % 100000 == 0) {
                logger.debug("Passed " + count + " documents");
            }
            Document document = new Document();
            for (String fieldName : fieldNames) {
                String v = source.get(fieldName);
                if (v != null) {
                    document.add(new Field(FIELD, v, Field.Store.YES, Field.Index.ANALYZED));
                }
            }
            writer.addDocument(document);
        }
    }
    if (count == 0) {
        throw new IllegalStateException("No matching documents found");
    }
}
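The example above targets the pre-4.0 API (reader.isDeleted, Field.Index.ANALYZED). For orientation, here is a minimal sketch of the same document(docID) call pattern in Lucene 5.x/6.x style, where the per-document isDeleted check is replaced by a live-docs bitset. The index path and the "title" field are assumptions, not taken from any example on this page.

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class DumpStoredFields {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; point this at a real index directory.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/tmp/index")))) {
            // Live-docs bitset; null means the index contains no deletions.
            Bits liveDocs = MultiFields.getLiveDocs(reader);
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                if (liveDocs != null && !liveDocs.get(docId)) {
                    continue; // skip deleted documents
                }
                Document doc = reader.document(docId); // loads the stored fields only
                System.out.println(docId + " -> " + doc.get("title"));
            }
        }
    }
}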
From source file: vectorizer.TermInfo.java
private DocVector buildTerms(IndexReader reader, int docId, int numDocs, Dictionary dict) throws Exception {
    DocVector wmap = new DocVector(reader.document(docId).get(ID_FIELD_NAME));
    Terms tfvector;
    TermsEnum termsEnum;
    String termText;
    BytesRef term;
    int tf;
    float idf;

    tfvector = reader.getTermVector(docId, CONTENT_FIELD_NAME);
    if (tfvector == null)
        return null;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();
        float df = reader.docFreq(new Term(CONTENT_FIELD_NAME, termText));
        idf = (float) Math.log(1 + numDocs / df);
        TermInfo termInfo = new TermInfo(termText, tf, getTermId(termText), idf);

        if (dict != null) {
            Translations translations = dict.getTranslationTerms(termText);
            for (TranslationInfo tinfo : translations.getTranslationInfo()) {
                termInfo.tf *= tinfo.weight;
            }
        }

        // Update global stats
        TermInfo seenTermInfo = collFreq.get(termText);
        if (seenTermInfo == null) {
            seenTermInfo = new TermInfo(termInfo.term, termInfo.tf, termInfo.id, termInfo.idf);
            collFreq.put(termText, seenTermInfo);
        } else {
            seenTermInfo.tf += termInfo.tf; // collection frequency
        }
        wmap.addTermInfo(termInfo);
    }
    return wmap;
}
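Note that reader.getTermVector(docId, field) returns null (handled above) unless the field was indexed with term vectors enabled. As a reminder, a minimal sketch of the indexing side follows; the "content" field name and sample text are illustrative only.

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

public class TermVectorFieldSketch {
    // Builds a document whose "content" field stores per-document term vectors,
    // so that IndexReader.getTermVector(docId, "content") is non-null at read time.
    static Document docWithVectors(String text) {
        FieldType withVectors = new FieldType(TextField.TYPE_STORED);
        withVectors.setStoreTermVectors(true);
        withVectors.freeze();
        Document doc = new Document();
        doc.add(new Field("content", text, withVectors));
        return doc;
    }
}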
From source file: wvec.WordVecsIndexer.java
void clusterWordVecs(IndexWriter clusterIndexWriter, int numClusters) throws Exception {
    // Index where word vectors are stored
    IndexReader reader = DirectoryReader.open(FSDirectory.open((new File(indexPath)).toPath()));
    int numDocs = reader.numDocs();
    KMeansPlusPlusClusterer<WordVec> clusterer = new KMeansPlusPlusClusterer<>(numClusters);
    List<WordVec> wordList = new ArrayList<>(numDocs);

    // Read every wvec and load in memory
    for (int i = 0; i < numDocs; i++) {
        Document doc = reader.document(i);
        WordVec wvec = new WordVec(doc.get(FIELD_WORD_VEC));
        wordList.add(wvec);
    }

    // Call K-means clustering
    System.out.println("Clustering the entire vocabulary...");
    List<CentroidCluster<WordVec>> clusters = clusterer.cluster(wordList);

    // Save the cluster info
    System.out.println("Writing out cluster ids in Lucene index...");
    int clusterId = 0;
    for (CentroidCluster<WordVec> c : clusters) {
        List<WordVec> pointsInThisCluster = c.getPoints();
        for (WordVec thisPoint : pointsInThisCluster) {
            Document clusterInfo = constructDoc(thisPoint.word, String.valueOf(clusterId));
            clusterIndexWriter.addDocument(clusterInfo);
        }
        clusterId++;
    }
    reader.close();
}
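Reading the cluster index back out uses the same document(int docID) pattern this page catalogues. Below is a sketch of a hypothetical reader-side counterpart; the field names "word" and "clusterid" are assumptions about what constructDoc stores, not confirmed by the source.

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class ClusterIndexDump {
    // Prints the word -> cluster id mapping written by clusterWordVecs.
    static void dump(String clusterIndexPath) throws Exception {
        IndexReader reader = DirectoryReader.open(FSDirectory.open((new File(clusterIndexPath)).toPath()));
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document d = reader.document(i); // hypothetical field names below
            System.out.println(d.get("word") + "\t" + d.get("clusterid"));
        }
        reader.close();
    }
}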