List of usage examples for org.apache.lucene.index.Fields#iterator()
@Override public abstract Iterator<String> iterator();
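Fields.iterator() enumerates the names of all fields that have indexed terms; the usual pattern, visible in every example below, is to obtain a Fields instance (often via MultiFields.getFields on a composite reader), walk the iterator, and look up each field's Terms. Here is a minimal sketch against the Lucene 4.x API these examples target; the index path is a placeholder:

import java.io.File;
import java.util.Iterator;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.FSDirectory;

public class ListFieldNames {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder for a real index directory
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")))) {
            Fields fields = MultiFields.getFields(reader); // null if the index has no fields
            if (fields != null) {
                Iterator<String> it = fields.iterator();
                while (it.hasNext()) {
                    String field = it.next();
                    Terms terms = fields.terms(field);
                    // Terms.size() may return -1 when the codec cannot cheaply compute it
                    System.out.println(field + " (" + (terms == null ? 0 : terms.size()) + " terms)");
                }
            }
        }
    }
}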
From source file:com.floragunn.searchguard.configuration.DlsFlsFilterLeafReader.java
License:Open Source License
@Override
public Fields fields() throws IOException {
    final Fields fields = in.fields();
    if (!flsEnabled) {
        return fields;
    }
    return new Fields() {

        @Override
        public Iterator<String> iterator() {
            return Iterators.<String>filter(fields.iterator(), new Predicate<String>() {
                @Override
                public boolean apply(final String input) {
                    return isFls(input);
                }
            });
        }

        @Override
        public Terms terms(final String field) throws IOException {
            if (!isFls(field)) {
                return null;
            }
            return in.terms(field);
        }

        @Override
        public int size() {
            return flsFieldInfos.size();
        }
    };
}
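The wrapper filters the delegate's field-name iterator lazily with Guava's Iterators.filter, so fields hidden by field-level security never surface through iteration, terms(), or size(); the same wrapping pattern is reused for term vectors in the next example.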
From source file:com.floragunn.searchguard.configuration.DlsFlsFilterLeafReader.java
License:Open Source License
@Override
public Fields getTermVectors(final int docID) throws IOException {
    final Fields fields = in.getTermVectors(docID);
    if (!flsEnabled || fields == null) {
        return fields;
    }
    return new Fields() {

        @Override
        public Iterator<String> iterator() {
            return Iterators.<String>filter(fields.iterator(), new Predicate<String>() {
                @Override
                public boolean apply(final String input) {
                    return isFls(input);
                }
            });
        }

        @Override
        public Terms terms(final String field) throws IOException {
            if (!isFls(field)) {
                return null;
            }
            return in.terms(field);
        }

        @Override
        public int size() {
            return flsFieldInfos.size();
        }
    };
}
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        // First pass: enumerate all fields and terms to build the feature dictionary
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        // Second pass: build a weighted sparse vector per document
        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);
            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);
            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);
        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
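This method makes two passes over the index: the first walks fields.iterator() and each field's TermsEnum to assign every (field, term) pair a stable index in the feature dictionary; the second re-reads each document's term vectors and writes a weighted, normalized sparse vector per document to the SequenceFile.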
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java
License:Open Source License
/**
 * @param reader
 * @param numTerms
 * @param field
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field) throws Exception {
    TermStatsQueue tiq = null;
    if (field != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("field " + field + " not found");
        }
        Terms terms = fields.terms(field);
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator(null);
            tiq = new TermStatsQueue(numTerms);
            tiq.fill(field, termsEnum);
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("no fields found for this index");
        }
        tiq = new TermStatsQueue(numTerms);
        Iterator<String> fieldNames = fields.iterator();
        while (fieldNames.hasNext()) {
            field = fieldNames.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                tiq.fill(field, terms.iterator(null));
            }
        }
    }
    TermStats[] result = new TermStats[tiq.size()];
    // The queue pops the lowest-frequency entry first, so fill the array
    // from the end to get the highest-frequency terms first
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:io.datalayer.lucene.frequency.AosFrequencyTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return AosTermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static AosTermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;
    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOGGER.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOGGER.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        Iterator<String> fieldsEnum = fields.iterator();
        while (fieldsEnum.hasNext()) {
            String field = fieldsEnum.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }
    AosTermStats[] result = new AosTermStats[tiq.size()];
    // The queue pops the lowest-frequency entry first, so fill the array
    // from the end to get the highest-frequency terms first
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:io.datalayer.lucene.index.IndexHtmlFilesMain.java
License:Apache License
private static void indexDocs(File file, File index, boolean create) throws Exception {
    if (!create) { // incrementally update
        reader = DirectoryReader.open(FSDirectory.open(index)); // open existing index
        Fields fields = MultiFields.getFields(reader);
        Iterator<String> fieldsEnum = fields.iterator();
        // uidIter = reader.terms(new Term("uid", "")); // init uid iterator
        indexDocs(file);
        if (deleting) { // delete rest of stale docs
            /*
             * while (uidIter.term() != null && uidIter.term().field() == "uid") {
             *     LOGGER.info("deleting " + HTMLDocument.uid2url(uidIter.term().text()));
             *     reader.deleteDocuments(uidIter.term());
             *     uidIter.next();
             * }
             */
            deleting = false;
        }
        reader.close(); // close existing index
    } else {
        indexDocs(file);
    }
}
From source file:narock.HighFreqTerms.java
License:Apache License
/**
 * @param reader
 * @param numTerms
 * @param fieldNames
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
        throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;
    if (fieldNames != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        for (String field : fieldNames) {
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            LOG.info("Index with no fields - probably empty or corrupted");
            return EMPTY_STATS;
        }
        tiq = new TermStatsQueue(numTerms);
        // Create the iterator once, outside the loop, so each field is
        // visited exactly once rather than re-reading the first field forever
        Iterator<String> fieldsEnum = fields.iterator();
        while (fieldsEnum.hasNext()) {
            String field = fieldsEnum.next();
            Terms terms = fields.terms(field);
            if (terms != null) {
                te = terms.iterator(te);
                fillQueue(te, tiq, field);
            }
        }
    }
    TermStats[] result = new TermStats[tiq.size()];
    // The queue pops the lowest-frequency entry first, so fill the array
    // from the end to get the highest-frequency terms first
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:org.elasticsearch.action.termlist.TransportTermlistAction.java
License:Apache License
@Override
protected ShardTermlistResponse shardOperation(ShardTermlistRequest request) throws ElasticSearchException {
    synchronized (termlistMutex) {
        InternalIndexShard indexShard = (InternalIndexShard) indicesService.indexServiceSafe(request.index())
                .shardSafe(request.shardId());
        indexShard.store().directory();
        Engine.Searcher searcher = indexShard.searcher();
        try {
            Set<String> set = new CompactHashSet();
            Fields fields = MultiFields.getFields(searcher.reader());
            if (fields != null) {
                for (Iterator<String> it = fields.iterator(); it.hasNext();) {
                    String field = it.next();
                    if (field.charAt(0) == '_') {
                        continue; // skip internal fields such as _uid and _source
                    }
                    if (request.getField() == null || field.equals(request.getField())) {
                        Terms terms = fields.terms(field);
                        if (terms != null) {
                            TermsEnum termsEnum = terms.iterator(null);
                            BytesRef text;
                            while ((text = termsEnum.next()) != null) {
                                set.add(text.utf8ToString());
                                System.out.println("field=" + field + "; text=" + text.utf8ToString());
                            }
                        }
                    }
                }
            }
            return new ShardTermlistResponse(request.index(), request.shardId(), set);
        } catch (IOException ex) {
            throw new ElasticSearchException(ex.getMessage(), ex);
        }
    }
}
From source file:org.elasticsearch.action.termvector.TermVectorResponse.java
License:Apache License
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    assert index != null;
    assert type != null;
    assert id != null;
    builder.startObject();
    builder.field(FieldStrings._INDEX, index);
    builder.field(FieldStrings._TYPE, type);
    builder.field(FieldStrings._ID, id);
    builder.field(FieldStrings._VERSION, docVersion);
    builder.field(FieldStrings.FOUND, isExists());
    if (!isExists()) {
        builder.endObject();
        return builder;
    }
    builder.startObject(FieldStrings.TERM_VECTORS);
    final CharsRef spare = new CharsRef();
    Fields theFields = getFields();
    Iterator<String> fieldIter = theFields.iterator();
    while (fieldIter.hasNext()) {
        // buildField(...) advances fieldIter to the next field name internally
        buildField(builder, spare, theFields, fieldIter);
    }
    builder.endObject();
    builder.endObject();
    return builder;
}
From source file:org.elasticsearch.action.termvectors.TermVectorsResponse.java
License:Apache License
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
    assert index != null;
    assert type != null;
    assert id != null;
    builder.field(FieldStrings._INDEX, index);
    builder.field(FieldStrings._TYPE, type);
    if (!isArtificial()) {
        builder.field(FieldStrings._ID, id);
    }
    builder.field(FieldStrings._VERSION, docVersion);
    builder.field(FieldStrings.FOUND, isExists());
    builder.field(FieldStrings.TOOK, tookInMillis);
    if (!isExists()) {
        return builder;
    }
    builder.startObject(FieldStrings.TERM_VECTORS);
    final CharsRefBuilder spare = new CharsRefBuilder();
    Fields theFields = getFields();
    Iterator<String> fieldIter = theFields.iterator();
    while (fieldIter.hasNext()) {
        // buildField(...) advances fieldIter to the next field name internally
        buildField(builder, spare, theFields, fieldIter);
    }
    builder.endObject();
    return builder;
}