List of usage examples for org.apache.lucene.index IndexReader getTermVector
public final Terms getTermVector(int docID, String field) throws IOException
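getTermVector returns the term vector stored for a single document and field, or null when the field was not indexed with term vectors. Before the project-specific examples below, here is a minimal standalone sketch of the typical iteration pattern; it assumes a Lucene 5.x or later API (Terms.iterator() without a reuse argument), a hypothetical index directory /path/to/index, and a hypothetical field name "contents" that was indexed with setStoreTermVectors(true).

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermVectorExample {
    public static void main(String[] args) throws IOException {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // Returns null unless the field was indexed with term vectors enabled
            Terms vector = reader.getTermVector(0, "contents");
            if (vector != null) {
                TermsEnum termsEnum = vector.iterator();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    // On a single-document term vector, totalTermFreq() is the within-document frequency
                    System.out.println(text.utf8ToString() + " -> " + termsEnum.totalTermFreq());
                }
            }
        }
    }
}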
From source file:lucene.CosineDocumentSimilarity.java
Map<String, Integer> getTermFrequencies(IndexReader reader, int docId) throws IOException {
    Terms vector = reader.getTermVector(docId, CONTENT);
    // The iterator must be obtained before use; leaving termsEnum null would throw a NullPointerException
    TermsEnum termsEnum = vector.iterator(null);
    Map<String, Integer> frequencies = new HashMap<>();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        int freq = (int) termsEnum.totalTermFreq();
        frequencies.put(term, freq);
        terms.add(term); // 'terms' is a Set<String> field of the enclosing class (not shown in this excerpt)
    }
    return frequencies;
}
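The class name suggests the frequency maps produced above feed a cosine similarity computation between two documents. A minimal sketch of that step under that assumption, using raw term counts and java.util.Map; the helper name cosineSimilarity is illustrative and not part of the class above:

// Hypothetical helper, not part of CosineDocumentSimilarity: cosine of two raw term-frequency maps.
static double cosineSimilarity(Map<String, Integer> v1, Map<String, Integer> v2) {
    double dot = 0.0, norm1 = 0.0, norm2 = 0.0;
    for (Map.Entry<String, Integer> e : v1.entrySet()) {
        Integer other = v2.get(e.getKey());
        if (other != null) {
            dot += e.getValue() * (double) other; // only terms shared by both documents contribute
        }
        norm1 += (double) e.getValue() * e.getValue();
    }
    for (int f : v2.values()) {
        norm2 += (double) f * f;
    }
    return (norm1 == 0 || norm2 == 0) ? 0.0 : dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
}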
From source file:lucenetools.TermData.java
License:Apache License
/**
 * Main application.
 *
 * @param args the command line arguments
 */
public static void main(String[] args) {
    Options opts = new Options();
    CommandLine commandLine = new CommandLine();

    // if no command line options specified, user wants help
    if (0 == args.length) {
        commandLine.showHelp();
        System.exit(0);
    }

    // extract command line args and store in opts
    if (!commandLine.parse(args, opts))
        System.exit(1);

    if (opts.showHelp) {
        commandLine.showHelp();
        System.exit(0);
    }

    // validate all command line options
    if (!commandLine.isValid(opts))
        System.exit(1);

    // report all command line options to the user
    System.out.println("\nLuceneToMtx version " + VERSION + ".");
    commandLine.printOpts(opts);

    long maxMemory = Runtime.getRuntime().maxMemory() / 1024 / 1024;
    System.out.println("Java runtime max memory: " + maxMemory + " MB.");

    // Build a map and assign a dictionary index to each term.
    // Include only those terms that survive the min term freq cutoff.
    Map<String, Integer> dictMap = new TreeMap<>();

    File file = null;
    System.out.println("Processing index...");

    try {
        file = new File(opts.indexDir);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(file));
        TermsEnum te = null;
        int nnz = 0, numCols = 0, maxDocs = reader.maxDoc();
        LinkedList<FeatureVector> matrixData = new LinkedList<>();

        // add other fields
        Collection<String> fields = new ArrayList<>();
        if (opts.fields > 0) {
            fields = MultiFields.getIndexedFields(reader);
            fields.remove(CONTENTSFIELD);
            fields.remove(PATHFIELD);
        }

        if (!extractTerms(reader, dictMap, opts.minTermFreq, maxDocs - 1, opts.maxTermPercentage))
            System.exit(1);

        // set of field names to extract
        Set<String> fieldSet = new HashSet<>();
        fieldSet.add(PATHFIELD);
        for (String s : fields) {
            fieldSet.add(s);
        }

        for (int i = 0; i < maxDocs; ++i) {
            // get term vector for next document
            Terms terms = reader.getTermVector(i, CONTENTSFIELD);
            if (terms == null)
                continue;

            te = terms.iterator(te);
            FeatureVector fv = new FeatureVector(numCols);
            int numEntries = buildFeatureVector(fv, te, dictMap);
            if (numEntries > 0) {
                // extract document path and save with FeatureVector
                Document doc = reader.document(i, fieldSet);
                fv.docPath = doc.get(PATHFIELD);
                // add any additional fields
                for (String s : fields) {
                    fv.fields.put(s, doc.get(s));
                }
                //System.out.println("processing document:" + fv.docPath);
                matrixData.add(fv);
                nnz += numEntries;
                ++numCols;
            }
        }

        // Sort the feature vectors by their document path field. Write
        // the matrix columns in this sorted order.
        Collections.sort(matrixData, new FeatureVectorComparator());

        File outdir = new File(opts.outDir);
        writeMatrixMarketFile(new File(outdir, MATRIXFILE), matrixData, dictMap.size(), numCols, nnz);
        System.out.println("Wrote " + MATRIXFILE + ".");
        writeDictionaryFile(new File(outdir, DICTFILE), dictMap);
        System.out.println("Wrote " + DICTFILE + ".");
        writeDocumentFile(new File(outdir, DOCFILE), matrixData);
        System.out.println("Wrote " + DOCFILE + ".");
        writeFieldFiles(outdir, fields, matrixData);
    } catch (IndexNotFoundException e) {
        if (null != file) {
            System.out.println("Lucene index not found in: " + file.getAbsolutePath());
        }
    } catch (IOException e) {
        System.out.println("LuceneToMtx exception: caught a " + e.getClass() + "\nMessage: " + e.getMessage());
    }
}
From source file:org.apache.solr.handler.admin.LukeRequestHandler.java
License:Apache License
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRef spare = new CharsRef();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<Object>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<Object>();

        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue());

        // may be a binary number
        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<Integer>();
                    final TermsEnum termsEnum = v.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        UnicodeUtil.UTF8toUTF16(text, spare);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
private static SimpleOrderedMap<Object> getDocumentFieldsInfo(Document doc, int docId, IndexReader reader,
        IndexSchema schema) throws IOException {
    final CharsRefBuilder spare = new CharsRefBuilder();
    SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();
    for (Object o : doc.getFields()) {
        Field field = (Field) o;
        SimpleOrderedMap<Object> f = new SimpleOrderedMap<>();

        SchemaField sfield = schema.getFieldOrNull(field.name());
        FieldType ftype = (sfield == null) ? null : sfield.getType();

        f.add("type", (ftype == null) ? null : ftype.getTypeName());
        f.add("schema", getFieldFlags(sfield));
        f.add("flags", getFieldFlags(field));

        Term t = new Term(field.name(), ftype != null ? ftype.storedToIndexed(field) : field.stringValue());

        f.add("value", (ftype == null) ? null : ftype.toExternal(field));

        // TODO: this really should be "stored"
        f.add("internal", field.stringValue());

        // may be a binary number
        BytesRef bytes = field.binaryValue();
        if (bytes != null) {
            f.add("binary", Base64.byteArrayToBase64(bytes.bytes, bytes.offset, bytes.length));
        }
        f.add("boost", field.boost());
        f.add("docFreq", t.text() == null ? 0 : reader.docFreq(t)); // this can be 0 for non-indexed fields

        // If we have a term vector, return that
        if (field.fieldType().storeTermVectors()) {
            try {
                Terms v = reader.getTermVector(docId, field.name());
                if (v != null) {
                    SimpleOrderedMap<Integer> tfv = new SimpleOrderedMap<>();
                    final TermsEnum termsEnum = v.iterator();
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        final int freq = (int) termsEnum.totalTermFreq();
                        spare.copyUTF8Bytes(text);
                        tfv.add(spare.toString(), freq);
                    }
                    f.add("termVector", tfv);
                }
            } catch (Exception ex) {
                log.warn("error writing term vector", ex);
            }
        }

        finfo.add(field.name(), f);
    }
    return finfo;
}
From source file:org.apache.solr.handler.component.TermVectorComponent.java
License:Apache License
@Override
public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
        return;
    }

    NamedList<Object> termVectors = new NamedList<Object>();
    rb.rsp.add(TERM_VECTORS, termVectors);

    IndexSchema schema = rb.req.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
        uniqFieldName = keyField.getName();
        termVectors.add("uniqueKeyFieldName", uniqFieldName);
    }

    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    //boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);

    // shortcut to all values.
    if (params.getBool(TermVectorParams.ALL, false)) {
        allFields.termFreq = true;
        allFields.positions = true;
        allFields.offsets = true;
        allFields.docFreq = true;
        allFields.tfIdf = true;
    }

    // Build up our per-field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList<List<String>> warnings = new NamedList<List<String>>();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    Set<String> fields = getFields(rb);
    if (null != fields) {
        // we have specific fields to retrieve, or no fields
        for (String field : fields) {

            // workaround SOLR-3523
            if (null == field || "score".equals(field))
                continue;

            // we don't want to issue warnings about the uniqueKey field
            // since it can cause lots of confusion in distributed requests
            // where the uniqueKey field is injected into the fl for merging
            final boolean fieldIsUniqueKey = field.equals(uniqFieldName);

            SchemaField sf = schema.getFieldOrNull(field);
            if (sf != null) {
                if (sf.storeTermVector()) {
                    FieldOptions option = fieldOptions.get(field);
                    if (option == null) {
                        option = new FieldOptions();
                        option.fieldName = field;
                        fieldOptions.put(field, option);
                    }
                    // get the per-field mappings
                    option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
                    option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
                    option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);

                    // validate that positions and offsets are even an option
                    option.positions = params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
                    if (option.positions && !sf.storeTermPositions() && !fieldIsUniqueKey) {
                        noPos.add(field);
                    }
                    option.offsets = params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
                    if (option.offsets && !sf.storeTermOffsets() && !fieldIsUniqueKey) {
                        noOff.add(field);
                    }
                } else { // field doesn't have term vectors
                    if (!fieldIsUniqueKey)
                        noTV.add(field);
                }
            } else {
                // field doesn't exist
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
            }
        }
    } // else, deal with all fields

    // NOTE: currently all types of warnings are schema driven, and guaranteed
    // to be consistent across all shards - if additional types of warnings
    // are added that might be different between shards, finishStage() needs
    // to be changed to account for that.
    boolean hasWarnings = false;
    if (!noTV.isEmpty()) {
        warnings.add("noTermVectors", noTV);
        hasWarnings = true;
    }
    if (!noPos.isEmpty()) {
        warnings.add("noPositions", noPos);
        hasWarnings = true;
    }
    if (!noOff.isEmpty()) {
        warnings.add("noOffsets", noOff);
        hasWarnings = true;
    }
    if (hasWarnings) {
        termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && !docIds.isEmpty()) {
        iter = docIds.iterator();
    } else {
        DocList list = listAndSet.docList;
        iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getIndexReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors

    // Only load the id field to get the uniqueKey of that field
    final String finalUniqFieldName = uniqFieldName;

    final List<String> uniqValues = new ArrayList<String>();

    // TODO: is this required to be single-valued? if so, we should STOP
    // once we find it...
    final StoredFieldVisitor getUniqValue = new StoredFieldVisitor() {
        @Override
        public void stringField(FieldInfo fieldInfo, String value) {
            uniqValues.add(value);
        }

        @Override
        public void intField(FieldInfo fieldInfo, int value) {
            uniqValues.add(Integer.toString(value));
        }

        @Override
        public void longField(FieldInfo fieldInfo, long value) {
            uniqValues.add(Long.toString(value));
        }

        @Override
        public Status needsField(FieldInfo fieldInfo) {
            return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES : Status.NO;
        }
    };

    TermsEnum termsEnum = null;

    while (iter.hasNext()) {
        Integer docId = iter.next();
        NamedList<Object> docNL = new NamedList<Object>();

        if (keyField != null) {
            reader.document(docId, getUniqValue);
            String uniqVal = null;
            if (uniqValues.size() != 0) {
                uniqVal = uniqValues.get(0);
                uniqValues.clear();
                docNL.add("uniqueKey", uniqVal);
                termVectors.add(uniqVal, docNL);
            }
        } else {
            // support for schemas w/o a unique key
            termVectors.add("doc-" + docId, docNL);
        }

        if (null != fields) {
            for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
                final String field = entry.getKey();
                final Terms vector = reader.getTermVector(docId, field);
                if (vector != null) {
                    termsEnum = vector.iterator(termsEnum);
                    mapOneVector(docNL, entry.getValue(), reader, docId, termsEnum, field);
                }
            }
        } else {
            // extract all fields
            final Fields vectors = reader.getTermVectors(docId);
            for (String field : vectors) {
                Terms terms = vectors.terms(field);
                if (terms != null) {
                    termsEnum = terms.iterator(termsEnum);
                    mapOneVector(docNL, allFields, reader, docId, termsEnum, field);
                }
            }
        }
    }
}
From source file:org.nlp4l.lucene.LuceneDocTermVector.java
License:Apache License
/**
 * Builds a ranked term vector for a single Lucene document.
 * (The original Javadoc was garbled in this listing; the descriptions below are reconstructed from the code.)
 *
 * @param reader     the {@link IndexReader} for the Lucene index to read from
 * @param docId      the Lucene document ID of the target document
 * @param fieldName  the Lucene field whose term vector is read
 * @param size       the maximum number of terms kept in the weight queue
 * @param termsReuse a {@link Terms} instance to reuse; if null, the term vector is loaded via getTermVector
 * @param liveDocs   the live-document bits; if null, MultiFields.getLiveDocs(reader) is used
 * @param twf        the term weight factory; if null, a {@link DefaultTfIdfTermWeightFactory} is used
 * @param stopWords  terms contained in this set are skipped; may be null
 * @throws IOException
 */
public LuceneDocTermVector(IndexReader reader, int docId, String fieldName, int size, Terms termsReuse,
        Bits liveDocs, TermWeightFactory twf, Set<String> stopWords) throws IOException {
    liveDocs = liveDocs == null ? MultiFields.getLiveDocs(reader) : liveDocs;
    twf = twf == null ? new DefaultTfIdfTermWeightFactory(reader, docId, fieldName, liveDocs) : twf;
    queue = new TermWeightQueue(size);
    if (termsReuse == null)
        termsReuse = reader.getTermVector(docId, fieldName);
    TermsEnum termsEnum = termsReuse.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        // candidate feature term
        final String term = text.utf8ToString();
        if (stopWords != null && stopWords.contains(term))
            continue;
        final TermWeight termWeight = twf.create(text);
        if (termWeight == null)
            continue;
        Map.Entry<String, TermWeight> entry = new Map.Entry<String, TermWeight>() {
            public String getKey() {
                return term;
            }

            public TermWeight getValue() {
                return termWeight;
            }

            public TermWeight setValue(TermWeight arg0) {
                // read-only entry; mutation is not supported
                return null;
            }
        };
        queue.insertWithOverflow(entry);
    }
}
From source file:org.ohdsi.usagi.tests.TestLucene.java
License:Apache License
public static void main(String[] args) throws IOException, ParseException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
    // Analyzer analyzer = new UsagiAnalyzer();

    FieldType textVectorField = new FieldType();
    textVectorField.setIndexed(true);
    textVectorField.setTokenized(true);
    textVectorField.setStoreTermVectors(true);
    textVectorField.setStoreTermVectorPositions(false);
    textVectorField.setStoreTermVectorPayloads(false);
    textVectorField.setStoreTermVectorOffsets(false);
    textVectorField.setStored(true);
    textVectorField.freeze();

    File indexFolder = new File(folder);
    if (indexFolder.exists())
        DirectoryUtilities.deleteDir(indexFolder);

    Directory dir = FSDirectory.open(indexFolder);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    IndexWriter writer = new IndexWriter(dir, iwc);

    Document doc = new Document();
    doc.add(new Field("F", "word1 word2 w3 word4", textVectorField));
    writer.addDocument(doc);
    doc = new Document();
    doc.add(new Field("F", "word1 word2 w3", textVectorField));
    writer.addDocument(doc);
    writer.close();

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(folder)));
    for (int i = 0; i < reader.numDocs(); i++) {
        TermsEnum termsEnum = reader.getTermVector(i, "F").iterator(null);
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            System.out.print(text.utf8ToString() + ",");
        }
        System.out.println();
    }

    IndexSearcher searcher = new IndexSearcher(reader);

    // MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
    // mlt.setMinTermFreq(0);
    // mlt.setMinDocFreq(0);
    // mlt.setMaxDocFreq(9999);
    // mlt.setMinWordLen(0);
    // mlt.setMaxWordLen(9999);
    // mlt.setMaxDocFreqPct(100);
    // mlt.setMaxNumTokensParsed(9999);
    // mlt.setMaxQueryTerms(9999);
    // mlt.setStopWords(null);
    // mlt.setFieldNames(new String[] { "F" });
    // mlt.setAnalyzer(new UsagiAnalyzer());
    // Query query = mlt.like("F", new StringReader("Systolic blood pressure"));

    QueryParser parser = new QueryParser(Version.LUCENE_4_9, "F", analyzer);
    Query query = parser.parse("word1");

    Explanation explanation = searcher.explain(query, 0);
    print(explanation);
    System.out.println();
    explanation = searcher.explain(query, 1);
    print(explanation);
    System.out.println();

    TopDocs topDocs = searcher.search(query, 99);
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        System.out.println(scoreDoc.score + "\t" + reader.document(scoreDoc.doc).get("F"));
    }
}
From source file:retriever.TermWt.java
DocVector(IndexReader reader, int docId) throws Exception {
    this.reader = reader;
    Terms terms = reader.getTermVector(docId, FIELD_ANALYZED_CONTENT);
    TermsEnum termsEnum;
    BytesRef term;
    List<TermWt> tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field

    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            tfvec.add(new TermWt(termStr, tf));
        }
    }

    Collections.sort(tfvec);
    vec = new TermWt[tfvec.size()];
    vec = tfvec.toArray(vec);
}
From source file:trustframework.evidence.github.ConversationMimicry.java
private Map<String, Integer> getFrequencyMap(IndexReader ir, Integer docIndex) throws IOException {
    Terms frequencyVector = ir.getTermVector(docIndex, "Content");
    TermsEnum termsIterator = frequencyVector.iterator();
    Map<String, Integer> frequencyMap = new HashMap<>();
    BytesRef text;
    while ((text = termsIterator.next()) != null) {
        String term = text.utf8ToString();
        int freq = (int) termsIterator.totalTermFreq();
        frequencyMap.put(term, freq);
    }
    return frequencyMap;
}
From source file:vectorizer.TermInfo.java
private DocVector buildTerms(IndexReader reader, int docId, int numDocs, Dictionary dict) throws Exception {
    DocVector wmap = new DocVector(reader.document(docId).get(ID_FIELD_NAME));
    Terms tfvector;
    TermsEnum termsEnum;
    String termText;
    BytesRef term;
    int tf;
    float idf;

    tfvector = reader.getTermVector(docId, CONTENT_FIELD_NAME);
    if (tfvector == null)
        return null;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field

    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();
        float df = reader.docFreq(new Term(CONTENT_FIELD_NAME, termText));
        idf = (float) Math.log(1 + numDocs / df);

        TermInfo termInfo = new TermInfo(termText, tf, getTermId(termText), idf);

        if (dict != null) {
            Translations translations = dict.getTranslationTerms(termText);
            for (TranslationInfo tinfo : translations.getTranslationInfo()) {
                termInfo.tf *= tinfo.weight;
            }
        }

        // Update global stats
        TermInfo seenTermInfo = collFreq.get(termText);
        if (seenTermInfo == null) {
            seenTermInfo = new TermInfo(termInfo.term, termInfo.tf, termInfo.id, termInfo.idf);
            collFreq.put(termText, seenTermInfo);
        } else {
            seenTermInfo.tf += termInfo.tf; // collection frequency
        }

        wmap.addTermInfo(termInfo);
    }
    return wmap;
}