Example usage for org.apache.lucene.document FieldType freeze

Introduction

On this page you can find usage examples for org.apache.lucene.document.FieldType.freeze().

Prototype

public void freeze() 

Document

Prevents future changes. Once a FieldType has been frozen, calling any of its setters throws an IllegalStateException.
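
As a minimal sketch of this behavior (not taken from the projects below, and assuming a Lucene 5+ style API where indexing is enabled via setIndexOptions), a frozen FieldType rejects further configuration:

import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

public class FreezeDemo {
    public static void main(String[] args) {
        FieldType type = new FieldType();
        type.setStored(true);
        type.setTokenized(true);
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        type.freeze(); // lock the configuration

        try {
            type.setStored(false); // rejected: the type is frozen
        } catch (IllegalStateException expected) {
            System.out.println("frozen: " + expected.getMessage());
        }
    }
}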

Usage

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.utils.NewsItemLuceneDocConverter.java

License:Apache License

private static FieldType getTextType() {
    FieldType ftype = new FieldType();
    ftype.setIndexed(true);
    ftype.setStoreTermVectors(true);
    ftype.setStored(true);
    ftype.freeze();
    return ftype;
}
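
Note that setIndexed() belongs to the Lucene 4.x API; in later versions (5.x onward) indexing is enabled by choosing index options instead. A rough modern equivalent of the type above (an assumption, not part of the original source):

private static FieldType getTextType() {
    FieldType ftype = new FieldType();
    // replaces setIndexed(true) on Lucene 5+
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    ftype.setStoreTermVectors(true);
    ftype.setStored(true);
    ftype.freeze();
    return ftype;
}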

From source file:com.epam.catgenome.dao.index.FeatureIndexDao.java

License:Open Source License

private void addCommonDocumentFields(Document document, FeatureIndexEntry entry, final Long featureFileId) {
    document.add(new SortedStringField(FeatureIndexFields.FEATURE_ID.getFieldName(), entry.getFeatureId()));

    FieldType fieldType = new FieldType();
    fieldType.setOmitNorms(true);
    fieldType.setIndexOptions(IndexOptions.DOCS);
    fieldType.setStored(true);
    fieldType.setTokenized(false);
    fieldType.setDocValuesType(DocValuesType.SORTED);
    fieldType.freeze();
    Field field = new Field(FeatureIndexFields.CHROMOSOME_ID.getFieldName(),
            entry.getChromosome() != null ? new BytesRef(entry.getChromosome().getId().toString())
                    : new BytesRef(""),
            fieldType);
    document.add(field);
    document.add(new SortedStringField(FeatureIndexFields.CHROMOSOME_NAME.getFieldName(),
            entry.getChromosome().getName(), true));

    document.add(new SortedIntPoint(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex()));
    document.add(new StoredField(FeatureIndexFields.START_INDEX.getFieldName(), entry.getStartIndex()));
    document.add(new SortedDocValuesField(FeatureIndexFields.START_INDEX.getGroupName(),
            new BytesRef(entry.getStartIndex().toString())));

    document.add(new SortedIntPoint(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex()));
    document.add(new StoredField(FeatureIndexFields.END_INDEX.getFieldName(), entry.getEndIndex()));
    document.add(new SortedDocValuesField(FeatureIndexFields.END_INDEX.getGroupName(),
            new BytesRef(entry.getEndIndex().toString())));

    document.add(new StringField(FeatureIndexFields.FEATURE_TYPE.getFieldName(),
            entry.getFeatureType() != null ? entry.getFeatureType().getFileValue() : "", Field.Store.YES));
    document.add(new StringField(FeatureIndexFields.FILE_ID.getFieldName(), featureFileId.toString(),
            Field.Store.YES));

    document.add(new StringField(FeatureIndexFields.FEATURE_NAME.getFieldName(),
            entry.getFeatureName() != null ? entry.getFeatureName().toLowerCase() : "", Field.Store.YES));
    document.add(new SortedDocValuesField(FeatureIndexFields.FEATURE_NAME.getFieldName(),
            new BytesRef(entry.getFeatureName() != null ? entry.getFeatureName() : "")));

    document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.CHR_ID.getFieldName(),
            entry.getChromosome().getId().toString()));

    document.add(new SortedStringField(FeatureIndexFields.UID.getFieldName(), entry.getUuid().toString()));
    document.add(new SortedSetDocValuesFacetField(FeatureIndexFields.F_UID.getFieldName(),
            entry.getUuid().toString()));
}
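
Because a frozen FieldType is immutable, it is safe to share one instance across all documents. A minimal sketch of that refactoring (hypothetical, not part of FeatureIndexDao) hoists the type into a constant instead of rebuilding it on every call:

// built once; freeze() makes it safe to reuse for every document
private static final FieldType SORTED_STRING_TYPE = new FieldType();
static {
    SORTED_STRING_TYPE.setOmitNorms(true);
    SORTED_STRING_TYPE.setIndexOptions(IndexOptions.DOCS);
    SORTED_STRING_TYPE.setStored(true);
    SORTED_STRING_TYPE.setTokenized(false);
    SORTED_STRING_TYPE.setDocValuesType(DocValuesType.SORTED);
    SORTED_STRING_TYPE.freeze();
}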

From source file:com.github.hotware.lucene.extension.bean.field.BeanInformationCacheImpl.java

License:BEER-WARE LICENSE

private FieldInformation buildFieldInformation(BeanField bf, Field field, Class<?> fieldClass) {
    com.github.hotware.lucene.extension.bean.type.Type typeWrapper;
    try {
        // TODO: maybe cache these?
        typeWrapper = (com.github.hotware.lucene.extension.bean.type.Type) bf.type().newInstance();
    } catch (InstantiationException | IllegalAccessException e) {
        throw new RuntimeException(e);
    }
    FieldType fieldType = new FieldType();
    fieldType.setIndexed(bf.index());
    fieldType.setStored(bf.store());
    fieldType.setTokenized(bf.tokenized());
    fieldType.setStoreTermVectors(bf.storeTermVectors());
    fieldType.setStoreTermVectorPositions(bf.storeTermVectorPositions());
    fieldType.setStoreTermVectorOffsets(bf.storeTermVectorOffsets());
    fieldType.setStoreTermVectorPayloads(bf.storeTermVectorPayloads());
    fieldType.setOmitNorms(bf.omitNorms());
    fieldType.setIndexOptions(bf.indexOptions());
    typeWrapper.configureFieldType(fieldType);
    fieldType.freeze();
    return new FieldInformation(new FrozenField(field), fieldClass, fieldType, bf);
}
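
For context, a hedged sketch of a bean that could drive this builder (the attribute names mirror the bf.* accessors above, but the exact annotation shape and its defaults are assumptions):

public class Article {
    @BeanField(index = true, store = true, tokenized = true,
            storeTermVectors = false, omitNorms = false)
    private String title;
}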

From source file:com.o19s.solr.swan.highlight.TermVectorFun.java

License:Apache License

@Test
public void testBlah() throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    // Index some made up content
    IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47));
    IndexWriter writer = new IndexWriter(ramDir, iwf);
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setTokenized(true);
    ft.setStored(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.freeze();
    for (int i = 0; i < DOCS.length; i++) {
        Document doc = new Document();
        StringField id = new StringField("id", "doc_" + i, StringField.Store.YES);
        doc.add(id);
        // Store both position and offset information
        Field text = new Field("content", DOCS[i], ft);
        // (pre-4.0 this was expressed as Field.Index.ANALYZED with
        // Field.TermVector.WITH_POSITIONS_OFFSETS)
        doc.add(text);
        writer.addDocument(doc);
    }
    // the writer is intentionally left open: the reader below is opened from it
    // Get a searcher
    AtomicReader dr = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer, true));
    IndexSearcher searcher = new IndexSearcher(dr);
    // Do a search using SpanQuery
    SpanTermQuery fleeceQ = new SpanTermQuery(new Term("content", "fleece"));
    TopDocs results = searcher.search(fleeceQ, 10);
    for (int i = 0; i < results.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = results.scoreDocs[i];
        System.out.println("Score Doc: " + scoreDoc);
    }
    IndexReader reader = searcher.getIndexReader();
    Bits acceptDocs = null;
    Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
    Spans spans = fleeceQ.getSpans(dr.getContext(), acceptDocs, termContexts);

    while (spans.next()) {
        System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end());
        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("content");
        reader.document(spans.doc(), visitor);
        Terms terms = reader.getTermVector(spans.doc(), "content");
        TermsEnum tenum = terms.iterator(null);

        while (tenum.next() != null) {
            System.out.println(tenum.term().utf8ToString());
        }

    }
}

From source file:edu.umass.cs.ciir.IndexFromGalago.java

License:Open Source License

public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        System.out.println(getUsage());
        return;
    }

    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    SimpleFormatter formatter = new SimpleFormatter();
    lfh.setFormatter(formatter);
    logger.addHandler(lfh);

    final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
    final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
    long total = corpus.getManifest().getLong("keyCount");
    final CorpusReader.KeyIterator iterator = corpus.getIterator();

    final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;
    // Analyzer includes options for text processing
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Step 1: tokenization (Lucene's StandardTokenizer works well for most text-retrieval use cases)
            TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
            // Step 2: transforming all tokens into lowercased ones
            ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(),
                    new LowerCaseFilter(ts.getTokenStream()));
            // Step 3: whether to remove stop words
            // Uncomment the following line to remove stop words
            // ts = new TokenStreamComponents( ts.getTokenizer(), new StopwordsFilter( ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET ) );
            // Step 4: whether to apply stemming
            // Uncomment the following line to apply Krovetz or Porter stemmer
            // ts = new TokenStreamComponents( ts.getTokenizer(), new KStemFilter( ts.getTokenStream() ) );
            // ts = new TokenStreamComponents( ts.getTokenizer(), new PorterStemFilter( ts.getTokenStream() ) );
            return ts;
        }
    };

    try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
        final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
        System.out.println("Similarity: " + cfg.getSimilarity());
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            iterator.forAllKeyStrings(docId -> {
                try {
                    Document document = iterator.getDocument(dcp);

                    String text = document.text;
                    String id = document.name;
                    System.out.println("Processing document: " + id);
                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(new StringField("id", id, Field.Store.YES));
                    // this stores the actual text with tags so formatting is preserved
                    doc.add(new StoredField("body", text));
                    org.jsoup.nodes.Document jsoup = Jsoup.parse(text);

                    // tokens of the document
                    FieldType fieldTypeText = new FieldType();
                    fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                    fieldTypeText.setStoreTermVectors(true);
                    fieldTypeText.setStoreTermVectorPositions(true);
                    fieldTypeText.setTokenized(true);
                    fieldTypeText.setStored(false);
                    fieldTypeText.freeze();
                    doc.add(new Field("tokens", jsoup.text(), fieldTypeText));

                    try {
                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (IOException e) {
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }

                } catch (Exception e) {
                    logger.log(Level.WARNING, "Pull-Document-Exception", e);
                    System.err.println(e.toString());
                }
            });

        }
    }

    System.out.println("Indexing Done. ");
}

From source file:org.aksw.palmetto.corpus.lucene.creation.PositionStoringLuceneIndexCreator.java

License:Open Source License

/**
 * Creates the index.
 * 
 * @param indexPath
 *            The path to the directory in which the Lucene index will be created
 * @param docIterator
 *            Iterator that iterates over the document texts.
 * @return true if the creation was successful, else false.
 */
public boolean createIndex(File indexPath, Iterator<IndexableDocument> docIterator) {
    LOGGER.info("Starting index creation...");
    IndexWriter writer = null;
    indexPath.mkdirs();
    Analyzer analyzer = new SimpleAnalyzer(true);
    try {
        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
        config.setOpenMode(OpenMode.CREATE);

        FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
        fieldType.setIndexed(true);
        fieldType.setStoreTermVectors(true);
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        fieldType.freeze();

        FieldType docLengthFieldType = new FieldType(IntField.TYPE_STORED);
        docLengthFieldType.setIndexed(false);
        docLengthFieldType.freeze();

        writer = new IndexWriter(FSDirectory.open(indexPath), config);
        int count = 0;
        Document indexDocument;
        IndexableDocument currentDocument;
        while (docIterator.hasNext()) {
            currentDocument = docIterator.next();
            if (currentDocument.getText().length() > 0) {
                indexDocument = toLuceneDocument(analyzer, currentDocument.getText(), fieldType);
                addDocumentLength(indexDocument, docLengthFieldName, docLengthFieldType,
                        currentDocument.getNumberOfTokens());
                writer.addDocument(indexDocument);
                ++count;
                if (count >= commitInterval) {
                    writer.commit();
                    System.gc();
                    count = 0;
                }
            }
        }
        LOGGER.info("Finished index creation.");
    } catch (IOException e) {
        LOGGER.error("Error while creating Index. Aborting.", e);
        return false;
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception e) {
                // nothing more can be done if closing the writer fails
            }
        }
    }
    return true;
}
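
A detail worth noting in this example: TextField.TYPE_NOT_STORED is itself a frozen preset, but the FieldType copy constructor returns a mutable copy. A minimal sketch of that copy-then-customize pattern:

// start from a frozen preset, customize the mutable copy, then re-freeze
FieldType positions = new FieldType(TextField.TYPE_NOT_STORED);
positions.setStoreTermVectors(true);
positions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
positions.freeze();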

From source file:org.aksw.palmetto.corpus.lucene.creation.SimpleLuceneIndexCreator.java

License:Open Source License

/**
 * Creates the index.
 * 
 * @param indexPath
 *            The path to the directory in which the Lucene index will be created
 * @param docIterator
 *            Iterator that iterates over the document texts.
 * @return true if the creation was successful, else false.
 */
public boolean createIndex(File indexPath, Iterator<String> docIterator) {
    LOGGER.info("Starting index creation...");
    IndexWriter writer = null;
    indexPath.mkdirs();
    Analyzer analyzer = new SimpleAnalyzer(true);
    try {
        IndexWriterConfig config = new IndexWriterConfig(version, analyzer);
        config.setOpenMode(OpenMode.CREATE);

        FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
        fieldType.setIndexed(true);
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        fieldType.freeze();

        writer = new IndexWriter(FSDirectory.open(indexPath), config);
        String text;
        int count = 0;
        while (docIterator.hasNext()) {
            text = docIterator.next();
            if (text.length() > 0) {
                writer.addDocument(toLuceneDocument(analyzer, text, fieldType));
                ++count;
                if (count >= commitInterval) {
                    writer.commit();
                    System.gc();
                    count = 0;
                }
            } else {
                LOGGER.warn("Got a document without content.");
            }
        }
        LOGGER.info("Finished index creation.");
    } catch (IOException e) {
        LOGGER.error("Error while creating Index. Aborting.", e);
        return false;
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception e) {
                // nothing more can be done if closing the writer fails
            }
        }
    }
    return true;
}

From source file:org.apache.camel.component.lucene.LuceneIndexer.java

License:Apache License

private static FieldType createFieldType(boolean tokenized) {
    FieldType answer = new FieldType();
    answer.setIndexed(true);
    answer.setStored(true);
    answer.setTokenized(tokenized);

    // freeze the answer so that it becomes immutable
    answer.freeze();

    return answer;
}
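
Since the returned instance is frozen and therefore immutable, it can be created once and shared. A hypothetical usage of the factory above (field names are assumptions, not from the Camel source):

private static final FieldType TOKENIZED = createFieldType(true);
private static final FieldType UNTOKENIZED = createFieldType(false);

Document doc = new Document();
doc.add(new Field("title", "Apache Camel", TOKENIZED));
doc.add(new Field("category", "integration", UNTOKENIZED));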

From source file:org.apache.mahout.clustering.TestClusterDumper.java

License:Apache License

private void getSampleData(String[] docs2) throws IOException {
    sampleData = Lists.newArrayList();
    RAMDirectory directory = new RAMDirectory();

    IndexWriter writer = new IndexWriter(directory,
            new IndexWriterConfig(Version.LUCENE_46, new StandardAnalyzer(Version.LUCENE_46)));

    try {
        for (int i = 0; i < docs2.length; i++) {
            Document doc = new Document();
            Field id = new StringField("id", "doc_" + i, Field.Store.YES);
            doc.add(id);
            // Store both position and offset information
            FieldType fieldType = new FieldType();
            fieldType.setStored(false);
            fieldType.setIndexed(true);
            fieldType.setTokenized(true);
            fieldType.setStoreTermVectors(true);
            fieldType.setStoreTermVectorPositions(true);
            fieldType.setStoreTermVectorOffsets(true);
            fieldType.freeze();
            Field text = new Field("content", docs2[i], fieldType);
            doc.add(text);
            writer.addDocument(doc);
        }
    } finally {
        Closeables.close(writer, false);
    }

    IndexReader reader = DirectoryReader.open(directory);

    Weight weight = new TFIDF();
    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);

    int numTerms = 0;
    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
        it.next();
        numTerms++;
    }
    termDictionary = new String[numTerms];
    int i = 0;
    for (Iterator<TermEntry> it = termInfo.getAllEntries(); it.hasNext();) {
        String term = it.next().getTerm();
        termDictionary[i] = term;
        System.out.println(i + " " + term);
        i++;
    }
    Iterable<Vector> iterable = new LuceneIterable(reader, "id", "content", termInfo, weight);

    i = 0;
    for (Vector vector : iterable) {
        assertNotNull(vector);
        NamedVector namedVector;
        if (vector instanceof NamedVector) {
            // rename it for testing purposes
            namedVector = new NamedVector(((NamedVector) vector).getDelegate(), "P(" + i + ')');

        } else {
            namedVector = new NamedVector(vector, "P(" + i + ')');
        }
        System.out.println(AbstractCluster.formatVector(namedVector, termDictionary));
        sampleData.add(new VectorWritable(namedVector));
        i++;
    }
}

From source file:org.apache.mahout.utils.vectors.lucene.CachedTermInfoTest.java

License:Apache License

@Before
public void before() throws IOException {
    directory = new RAMDirectory();

    FieldType fieldType = new FieldType();
    fieldType.setStored(false);
    fieldType.setIndexed(true);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectors(false);
    fieldType.setStoreTermVectorPositions(false);
    fieldType.setStoreTermVectorOffsets(false);
    fieldType.freeze();

    directory = createTestIndex(fieldType, directory, 0);
}