List of usage examples for org.apache.lucene.index.Fields#iterator()
@Override public abstract Iterator<String> iterator();
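Fields.iterator() enumerates the names of all fields that have indexed terms; the usual pattern, visible in every example below, is to obtain a Fields instance (often via MultiFields.getFields on a composite reader), walk the iterator, and look up each field's Terms. Here is a minimal sketch against the Lucene 4.x API these examples target; the index path is a placeholder:

import java.io.File;
import java.util.Iterator;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.FSDirectory;

public class ListFieldNames {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder for a real index directory
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")))) {
            Fields fields = MultiFields.getFields(reader); // null if the index has no fields
            if (fields != null) {
                Iterator<String> it = fields.iterator();
                while (it.hasNext()) {
                    String field = it.next();
                    Terms terms = fields.terms(field);
                    // Terms.size() may return -1 when the codec cannot cheaply compute it
                    System.out.println(field + " (" + (terms == null ? 0 : terms.size()) + " terms)");
                }
            }
        }
    }
}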
From source file:com.floragunn.searchguard.configuration.DlsFlsFilterLeafReader.java
License:Open Source License
@Override
public Fields fields() throws IOException {
    final Fields fields = in.fields();
    if (!flsEnabled) {
        return fields;
    }
    return new Fields() {

        @Override
        public Iterator<String> iterator() {
            return Iterators.<String>filter(fields.iterator(), new Predicate<String>() {
                @Override
                public boolean apply(final String input) {
                    return isFls(input);
                }
            });
        }

        @Override
        public Terms terms(final String field) throws IOException {
            if (!isFls(field)) {
                return null;
            }
            return in.terms(field);
        }

        @Override
        public int size() {
            return flsFieldInfos.size();
        }
    };
}
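The wrapper filters the delegate's field-name iterator lazily with Guava's Iterators.filter, so fields hidden by field-level security never surface through iteration, terms(), or size(); the same wrapping pattern is reused for term vectors in the next example.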
From source file:com.floragunn.searchguard.configuration.DlsFlsFilterLeafReader.java
License:Open Source License
@Override
public Fields getTermVectors(final int docID) throws IOException {
    final Fields fields = in.getTermVectors(docID);
    if (!flsEnabled || fields == null) {
        return fields;
    }
    return new Fields() {

        @Override
        public Iterator<String> iterator() {
            return Iterators.<String>filter(fields.iterator(), new Predicate<String>() {
                @Override
                public boolean apply(final String input) {
                    return isFls(input);
                }
            });
        }

        @Override
        public Terms terms(final String field) throws IOException {
            if (!isFls(field)) {
                return null;
            }
            return in.terms(field);
        }

        @Override
        public int size() {
            return flsFieldInfos.size();
        }
    };
}
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        // First pass: enumerate all fields and terms to build the feature dictionary
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        // Second pass: build a weighted sparse vector per document
        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);
            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);
            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);
        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
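This method makes two passes over the index: the first walks fields.iterator() and each field's TermsEnum to assign every (field, term) pair a stable index in the feature dictionary; the second re-reads each document's term vectors and writes a weighted, normalized sparse vector per document to the SequenceFile.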
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java
License:Open Source License
/**
 * @param reader
 * @param numTerms
 * @param field
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception {
    TermStatsQueue tiq = null;
    if (field != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("field " + field + " not found");
        }
        Terms terms = fields.terms(field);
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator(null);
            tiq = new TermStatsQueue(numTerms);
            tiq.fill(field, termsEnum);
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("no fields found for this index");
        }
        tiq = new TermStatsQueue(numTerms);
        Iterator<String> fieldNames = fields.iterator();
        while (fieldNames.hasNext()) {
            field = fieldNames.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                tiq.fill(field, terms.iterator(null));
            }
        }
    }
    TermStats[] result = new TermStats[tiq.size()];
    // The queue pops the lowest-frequency entry first, so fill the array
    // from the end to get the highest-frequency terms first
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:io.datalayer.lucene.frequency.AosFrequencyTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return AosTermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static AosTermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;
    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOGGER.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOGGER.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        Iterator<String> fieldsEnum = fields.iterator();
        while (fieldsEnum.hasNext()) {
            String field = fieldsEnum.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }
    AosTermStats[] result = new AosTermStats[tiq.size()];
    // The queue pops the lowest-frequency entry first, so fill the array
    // from the end to get the highest-frequency terms first
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:io.datalayer.lucene.index.IndexHtmlFilesMain.java
License:Apache License
private static void indexDocs(File file, File index, boolean create) throws Exception {
    if (!create) { // incrementally update
        reader = DirectoryReader.open(FSDirectory.open(index)); // open existing index
        Fields fields = MultiFields.getFields(reader);
        Iterator<String> fieldsEnum = fields.iterator();
        // uidIter = reader.terms(new Term("uid", "")); // init uid iterator
        indexDocs(file);
        if (deleting) { // delete rest of stale docs
            /*
             * while (uidIter.term() != null && uidIter.term().field() == "uid") {
             *     LOGGER.info("deleting " + HTMLDocument.uid2url(uidIter.term().text()));
             *     reader.deleteDocuments(uidIter.term());
             *     uidIter.next();
             * }
             */
            deleting = false;
        }
        reader.close(); // close existing index
    } else {
        indexDocs(file);
    }
}
From source file:narock.HighFreqTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;
    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        // Create the iterator once, outside the loop, so each field is
        // visited exactly once rather than re-reading the first field forever
        Iterator<String> fieldsEnum = fields.iterator();
        while (fieldsEnum.hasNext()) {
            String field = fieldsEnum.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }
    TermStats[] result = new TermStats[tiq.size()];
    // The queue pops the lowest-frequency entry first, so fill the array
    // from the end to get the highest-frequency terms first
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:org.elasticsearch.action.termlist.TransportTermlistAction.java
License:Apache License
@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) throws ElasticSearchException {
    synchronized (termlistMutex) {
        InternalIndexShard indexShard = (InternalIndexShard) indicesService.indexServiceSafe(request.index())
                .shardSafe(request.shardId());
        indexShard.store().directory();
        Engine.Searcher searcher = indexShard.searcher();
        try {
            Set<String> set = new CompactHashSet();
            Fields fields = MultiFields.getFields(searcher.reader());
            if (fields != null) {
                for (Iterator<String> it = fields.iterator(); it.hasNext();) {
                    String field = it.next();
                    if (field.charAt(0) == '_') {
                        continue; // skip internal fields such as _uid and _source
                    }
                    if (request.getField() == null || field.equals(request.getField())) {
                        Terms terms = fields.terms(field);
                        if (terms != null) {
                            TermsEnum termsEnum = terms.iterator(null);
                            BytesRef text;
                            while ((text = termsEnum.next()) != null) {
                                set.add(text.utf8ToString());
                                System.out.println("field=" + field + "; text=" + text.utf8ToString());
                            }
                        }
                    }
                }
            }
            return new ShardTermlistResponse(request.index(), request.shardId(), set);
        } catch (IOException ex) {
            throw new ElasticSearchException(ex.getMessage(), ex);
        }
    }
}
From source file:org.elasticsearch.action.termvector.TermVectorResponse.java
License:Apache License
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    assert index != null;
    assert type != null;
    assert id != null;
    builder.startObject();
    builder.field(FieldStrings._INDEX, index);
    builder.field(FieldStrings._TYPE, type);
    builder.field(FieldStrings._ID, id);
    builder.field(FieldStrings._VERSION, docVersion);
    builder.field(FieldStrings.FOUND, isExists());
    if (!isExists()) {
        builder.endObject();
        return builder;
    }
    builder.startObject(FieldStrings.TERM_VECTORS);
    final CharsRef spare = new CharsRef();
    Fields theFields = getFields();
    Iterator<String> fieldIter = theFields.iterator();
    while (fieldIter.hasNext()) {
        // buildField(...) advances fieldIter to the next field name internally
        buildField(builder, spare, theFields, fieldIter);
    }
    builder.endObject();
    builder.endObject();
    return builder;
}
From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java
License:Apache License
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    assert index != null;
    assert type != null;
    assert id != null;
    builder.field(FieldStrings._INDEX, index);
    builder.field(FieldStrings._TYPE, type);
    if (!isArtificial()) {
        builder.field(FieldStrings._ID, id);
    }
    builder.field(FieldStrings._VERSION, docVersion);
    builder.field(FieldStrings.FOUND, isExists());
    builder.field(FieldStrings.TOOK, tookInMillis);
    if (!isExists()) {
        return builder;
    }
    builder.startObject(FieldStrings.TERM_VECTORS);
    final CharsRefBuilder spare = new CharsRefBuilder();
    Fields theFields = getFields();
    Iterator<String> fieldIter = theFields.iterator();
    while (fieldIter.hasNext()) {
        // buildField(...) advances fieldIter to the next field name internally
        buildField(builder, spare, theFields, fieldIter);
    }
    builder.endObject();
    return builder;
}