Example usage for org.apache.lucene.document FieldType setStoreTermVectorPositions

List of usage examples for org.apache.lucene.document FieldType setStoreTermVectorPositions

Introduction

On this page you can find example usages of org.apache.lucene.document.FieldType.setStoreTermVectorPositions.

Prototype

public void setStoreTermVectorPositions(boolean value) 

Source Link

Document

Set to true to also store token positions into the term vector for this field.

Usage

From source file:alix.lucene.Alix.java

License:Open Source License

/**
 * Parse field type String/* ww  w .  j  a  va2 s . c om*/
 * 
 * @param name Name of the field
 * @param value Value of the field
 * @param options a string composed of letters in any order following Luke convention to describe fields
 * IdfpoPSV
 * I: Indexed
 * d: docs
 * f: freqs
 * p: pos
 * o: offset
 * P: payloads
 * S: Stored
 * V: TermVector
 */
public static FieldType fieldType(String options) {
    FieldType type;
    if (options == null)
        return new FieldType();
    if ("S".equals(options)) {
        type = new FieldType();
        type.setStored(true);
        return type;
    }
    if (options.contains("S")) {
        type = new FieldType(TextField.TYPE_STORED);
    } else {
        type = new FieldType(TextField.TYPE_NOT_STORED);
    }
    // optimize ?
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (options.contains("p")) {
        type.setStoreTermVectorPositions(true);
    }

    if (options.contains("o")) {
        type.setTokenized(true);
        type.setStoreTermVectors(true);
        type.setStoreTermVectorOffsets(true);
    }
    if (options.contains("P")) {
        type.setTokenized(true);
        type.setStoreTermVectors(true);
        type.setStoreTermVectorPositions(true);
        type.setStoreTermVectorPayloads(true);
    }
    if (options.contains("V")) {
        type.setTokenized(true);
        type.setStoreTermVectors(true);
    }
    return type;
}

From source file:api.startup.PDFIndexer.java

License:Open Source License

/**
 * Indexes a single document and writes it to the given index writer.
 *
 * <p>The Lucene document holds the file path, the author, title and
 * conference from the metadata, the file's last-modified time, and the text
 * extracted from the PDF (stored, tokenized, with term vectors including
 * positions, offsets and payloads).
 *
 * @param writer   the index writer to write to
 * @param metadata the document metadata (filename, author, title, conference)
 * @throws IOException declared for callers; note that an {@link IOException}
 *                     raised while reading or indexing is caught and logged
 *                     here rather than rethrown
 */
static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException {
    Path file = Paths.get(metadata.getFilename());
    try {
        Document doc = new Document();

        Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add Document metadata //
        doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES));
        // End of Document Metadata //

        Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(),
                Field.Store.YES);
        doc.add(modified);

        PDFTextExtractor extractor = new PDFTextExtractor();
        // Get the string contents
        String textContents = extractor.extractText(file.toString());

        // Store the string contents
        FieldType contentsType = new FieldType();
        contentsType.setStored(true);
        contentsType.setTokenized(true);
        contentsType.setStoreTermVectors(true);
        contentsType.setStoreTermVectorPositions(true);
        contentsType.setStoreTermVectorPayloads(true);
        contentsType.setStoreTermVectorOffsets(true);
        contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType);
        doc.add(contents);

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            log.info("adding " + file + " to index");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            log.info("updating " + file + " in index");
            writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc);
        }
    } catch (IOException e) {
        // Best effort: one unreadable file must not abort the caller, but keep
        // the exception so the cause of the failure is visible in the log.
        log.error("Failed to read file " + metadata.getFilename(), e);
    }

}

From source file:ci6226.buildindex.java

/**
 * Indexes the Yelp review training set into a Lucene index under
 * {@code ./myindex}. The input file is read line by line and each non-blank
 * line is parsed as one JSON object; its {@code review_id} and
 * {@code business_id} become stored string fields and its {@code text}
 * becomes a stored, tokenized field with term vectors (positions and offsets).
 *
 * @param args the command line arguments (not used)
 */
public static void main(String[] args) throws FileNotFoundException, IOException, ParseException {
    String file = "/home/steven/Dropbox/workspace/ntu_coursework/ci6226/Assiment/yelpdata/yelp_training_set/yelp_training_set_review.json";
    long start = System.currentTimeMillis();
    String indexPath = "./myindex";
    System.out.println("Indexing to directory '" + indexPath + "'...");
    // TODO: try different analyzer, stop words, word stemming; check index size
    Analyzer analyzer = new myAnalyzer();

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    // Add new documents to an existing index, creating it if necessary:
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

    // The review text type is the same for every document, so build it once.
    // http://qindongliang1922.iteye.com/blog/2030639
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setStored(true);
    ft.setStoreTermVectors(true);
    ft.setTokenized(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);

    // try-with-resources closes the reader, the writer and the directory even
    // when parsing or indexing fails part-way through. Closing the writer
    // commits whatever has been added so far.
    // NOTE(review): FileReader decodes with the platform default charset on
    // older JDKs; confirm that matches the encoding of the input file.
    try (BufferedReader in = new BufferedReader(new FileReader(file));
            Directory dir = FSDirectory.open(new File(indexPath));
            IndexWriter writer = new IndexWriter(dir, iwc)) {
        int line = 0;
        String s;
        // readLine() returning null is the reliable end-of-input signal.
        while ((s = in.readLine()) != null) {
            if (s.trim().isEmpty()) {
                // A blank line cannot be parsed into a JSON object; skip it.
                continue;
            }
            JSONObject review = (JSONObject) JSONValue.parse(s);
            String text = (String) review.get("text");
            String business_id = (String) review.get("business_id");
            String review_id = (String) review.get("review_id");

            // user_id and the vote counters (funny / cool / useful) are
            // present in the input but are not indexed.
            Document doc = new Document();
            doc.add(new StringField("review_id", review_id, Field.Store.YES));
            doc.add(new StringField("business_id", business_id, Field.Store.YES));
            doc.add(new Field("text", text, ft));

            writer.addDocument(doc);

            System.out.println(line++);
        }
    }

    System.out.println(System.currentTimeMillis() - start + " total milliseconds");
}

From source file:ci6226.eval_index_writer.java

public eval_index_writer(Analyzer _analyzer, String _iReviewLocation, String _dir) throws IOException {
    String file = _iReviewLocation;
    JSONParser parser = new JSONParser();
    BufferedReader in = new BufferedReader(new FileReader(file));
    Date start = new Date();
    String indexPath = "./" + _dir;
    System.out.println("Indexing to directory '" + indexPath + "'...");
    Analyzer analyzer = _analyzer;//from www.  j  ava  2 s  .  com
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    Directory dir = FSDirectory.open(new File(indexPath));
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(dir, iwc);
    //  int line=0;
    while (in.ready()) {
        String s = in.readLine();
        Object obj = JSONValue.parse(s);
        JSONObject person = (JSONObject) obj;
        String text = (String) person.get("text");
        String user_id = (String) person.get("user_id");
        String business_id = (String) person.get("business_id");
        String review_id = (String) person.get("review_id");
        JSONObject votes = (JSONObject) person.get("votes");
        long funny = (Long) votes.get("funny");
        long cool = (Long) votes.get("cool");
        long useful = (Long) votes.get("useful");
        Document doc = new Document();
        Field review_idf = new StringField("review_id", review_id, Field.Store.YES);
        doc.add(review_idf);
        //    Field business_idf = new StringField("business_id", business_id, Field.Store.YES);
        //     doc.add(business_idf);

        //http://qindongliang1922.iteye.com/blog/2030639
        FieldType ft = new FieldType();
        ft.setIndexed(true);//
        ft.setStored(true);//
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        ft.setStoreTermVectorPositions(true);//
        ft.setStoreTermVectorOffsets(true);//

        Field textf = new Field("text", text, ft);

        doc.add(textf);
        //    Field user_idf = new StringField("user_id", user_id, Field.Store.YES);
        //     doc.add(user_idf);
        //      doc.add(new LongField("cool", cool, Field.Store.YES));
        //      doc.add(new LongField("funny", funny, Field.Store.YES));
        //       doc.add(new LongField("useful", useful, Field.Store.YES));

        writer.addDocument(doc);

        //  System.out.println(line++);
    }

    writer.close();
    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");
}

From source file:com.github.hotware.lucene.extension.bean.field.BeanInformationCacheImpl.java

License:BEER-WARE LICENSE

/**
 * Translates the settings of a {@link BeanField} annotation into a frozen
 * Lucene {@link FieldType} and bundles it with the reflected field.
 *
 * @param bf         annotation carrying the index/store/term-vector settings
 * @param field      the reflected bean field
 * @param fieldClass the class recorded for the field
 * @return the assembled field information
 */
private FieldInformation buildFieldInformation(BeanField bf, Field field, Class<?> fieldClass) {
    final com.github.hotware.lucene.extension.bean.type.Type wrapper;
    try {
        // TODO: maybe cache these?
        Object instance = bf.type().newInstance();
        wrapper = (com.github.hotware.lucene.extension.bean.type.Type) instance;
    } catch (InstantiationException | IllegalAccessException e) {
        throw new RuntimeException(e);
    }

    final FieldType luceneType = new FieldType();
    // basic flags
    luceneType.setIndexed(bf.index());
    luceneType.setStored(bf.store());
    luceneType.setTokenized(bf.tokenized());
    // term vector flags
    luceneType.setStoreTermVectors(bf.storeTermVectors());
    luceneType.setStoreTermVectorPositions(bf.storeTermVectorPositions());
    luceneType.setStoreTermVectorOffsets(bf.storeTermVectorOffsets());
    luceneType.setStoreTermVectorPayloads(bf.storeTermVectorPayloads());
    // norms and postings detail
    luceneType.setOmitNorms(bf.omitNorms());
    luceneType.setIndexOptions(bf.indexOptions());

    // The type wrapper gets the last word before the type is frozen.
    wrapper.configureFieldType(luceneType);
    luceneType.freeze();

    final FrozenField frozen = new FrozenField(field);
    return new FieldInformation(frozen, fieldClass, luceneType, bf);
}

From source file:com.meizu.nlp.classification.ClassificationTestBase.java

License:Apache License

/**
 * Replaces the current index writer with a fresh one opened in CREATE mode
 * and fills it with 1000 documents, each holding a random text, a category
 * ("technology" or "politics") and the matching boolean flag as a string.
 *
 * @param analyzer analyzer used for the new index writer configuration
 * @throws IOException if closing, committing or writing to the index fails
 */
private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
    indexWriter.close();
    indexWriter = new RandomIndexWriter(random(), dir,
            newIndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    indexWriter.commit();

    // stored text type that also keeps term vectors with offsets and positions
    final FieldType vectorType = new FieldType(TextField.TYPE_STORED);
    vectorType.setStoreTermVectors(true);
    vectorType.setStoreTermVectorOffsets(true);
    vectorType.setStoreTermVectorPositions(true);

    final Random rnd = random();
    final int docCount = 1000;
    for (int n = 0; n < docCount; n++) {
        final boolean technology = rnd.nextBoolean();
        final String category;
        if (technology) {
            category = "technology";
        } else {
            category = "politics";
        }

        final Document document = new Document();
        document.add(new Field(textFieldName, createRandomString(rnd), vectorType));
        document.add(new Field(categoryFieldName, category, vectorType));
        document.add(new Field(booleanFieldName, String.valueOf(technology), vectorType));
        indexWriter.addDocument(document);
    }
    indexWriter.commit();
}

From source file:com.meizu.nlp.classification.utils.DatasetSplitter.java

License:Apache License

/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex        an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code> if all should be used;
 *                             a requested field that a document does not contain is skipped for that document
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
        Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {

    // create IWs for train / test / cv IDXs; try-with-resources guarantees that
    // every writer that was opened is closed, even if a later constructor,
    // a commit or another close fails
    try (IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
            IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
            IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer))) {

        int size = originalIndex.maxDoc();

        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);

        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPositions(true);

        int b = 0;

        // iterate over existing documents
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {

            // load the stored document once instead of once per copied field
            Document original = originalIndex.document(scoreDoc.doc);

            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.length > 0) {
                for (String fieldName : fieldNames) {
                    IndexableField field = original.getField(fieldName);
                    // a document may lack a requested field (or hold no string
                    // value for it); skip it rather than fail the whole split
                    if (field != null && field.stringValue() != null) {
                        doc.add(new Field(fieldName, field.stringValue(), ft));
                    }
                }
            } else {
                for (IndexableField storableField : original.getFields()) {
                    if (storableField.readerValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
                    } else if (storableField.binaryValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
                    } else if (storableField.stringValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
                    } else if (storableField.numericValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
                    }
                }
            }

            // add it to one of the IDXs: every other document is offered to the
            // test index until it holds size * testRatio docs, then the cross
            // validation index is filled up to size * crossValidationRatio,
            // and everything else goes to the training index
            if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
                testWriter.addDocument(doc);
            } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
                cvWriter.addDocument(doc);
            } else {
                trainingWriter.addDocument(doc);
            }
            b++;
        }

        testWriter.commit();
        cvWriter.commit();
        trainingWriter.commit();
    } catch (IOException e) {
        // already the declared type: do not wrap it a second time
        throw e;
    } catch (Exception e) {
        throw new IOException(e);
    }
}

From source file:com.meizu.nlp.classification.utils.DataSplitterTest.java

License:Apache License

/**
 * Creates a fresh directory and random index writer, indexes 100 documents
 * (a numeric id, a random unicode text and a random unicode class label,
 * all stored with term vectors) and exposes the result through
 * {@code originalIndex}.
 */
@Override
@Before
public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    indexWriter = new RandomIndexWriter(random(), dir);

    // stored text type that also keeps term vectors with offsets and positions
    final FieldType vectorType = new FieldType(TextField.TYPE_STORED);
    vectorType.setStoreTermVectors(true);
    vectorType.setStoreTermVectorOffsets(true);
    vectorType.setStoreTermVectorPositions(true);

    final Random rnd = random();
    int id = 0;
    while (id < 100) {
        final Document document = new Document();
        document.add(new Field(idFieldName, String.valueOf(id), vectorType));
        final String text = TestUtil.randomUnicodeString(rnd, 1024);
        document.add(new Field(textFieldName, text, vectorType));
        final String label = TestUtil.randomUnicodeString(rnd, 10);
        document.add(new Field(classFieldName, label, vectorType));
        indexWriter.addDocument(document);
        id++;
    }

    indexWriter.commit();

    originalIndex = SlowCompositeReaderWrapper.wrap(indexWriter.getReader());

}

From source file:com.meizu.nlp.classification.utils.DocToDoubleVectorUtilsTest.java

License:Apache License

/**
 * Creates a fresh directory, indexes 10 documents (an id plus a text made of
 * three space-separated random digits, both stored with term vectors), keeps
 * a reader on the result in {@code index} and closes the writer.
 */
@Override
@Before
public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    final RandomIndexWriter writer = new RandomIndexWriter(random(), dir);

    // stored text type that also keeps term vectors with offsets and positions
    final FieldType vectorType = new FieldType(TextField.TYPE_STORED);
    vectorType.setStoreTermVectors(true);
    vectorType.setStoreTermVectorOffsets(true);
    vectorType.setStoreTermVectorPositions(true);

    int id = 0;
    while (id < 10) {
        final Document document = new Document();
        document.add(new Field("id", String.valueOf(id), vectorType));

        // three random digits, drawn in order, joined by single spaces
        final int first = random().nextInt(10);
        final int second = random().nextInt(10);
        final int third = random().nextInt(10);
        final String text = first + " " + second + " " + third;
        document.add(new Field("text", text, vectorType));

        writer.addDocument(document);
        id++;
    }

    writer.commit();

    index = writer.getReader();

    writer.close();
}

From source file:com.o19s.solr.swan.highlight.TermVectorFun.java

License:Apache License

/**
 * Exploratory test: indexes the {@code DOCS} strings into an in-memory
 * directory with term vectors (positions and offsets), runs a span query for
 * the term "fleece" on the "content" field, and prints the matching score
 * docs, the span positions and every term of each matching document's term
 * vector.
 *
 * @throws IOException if indexing or reading the in-memory index fails
 */
@Test
public void testBlah() throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    try {
        // Index some made up content
        IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47));
        IndexWriter writer = new IndexWriter(ramDir, iwf);
        try {
            FieldType ft = new FieldType();
            ft.setIndexed(true);
            ft.setTokenized(true);
            ft.setStored(true);
            ft.setStoreTermVectorOffsets(true);
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorPositions(true);
            ft.freeze();
            for (int i = 0; i < DOCS.length; i++) {
                Document doc = new Document();
                StringField id = new StringField("id", "doc_" + i, StringField.Store.YES);
                doc.add(id);
                // Store both position and offset information
                Field text = new Field("content", DOCS[i], ft);
                doc.add(text);
                writer.addDocument(doc);
            }

            // Get a searcher on a reader opened directly from the (still open) writer
            AtomicReader dr = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer, true));
            try {
                IndexSearcher searcher = new IndexSearcher(dr);
                // Do a search using SpanQuery
                SpanTermQuery fleeceQ = new SpanTermQuery(new Term("content", "fleece"));
                TopDocs results = searcher.search(fleeceQ, 10);
                for (int i = 0; i < results.scoreDocs.length; i++) {
                    ScoreDoc scoreDoc = results.scoreDocs[i];
                    System.out.println("Score Doc: " + scoreDoc);
                }
                IndexReader reader = searcher.getIndexReader();
                Bits acceptDocs = null;
                Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
                Spans spans = fleeceQ.getSpans(dr.getContext(), acceptDocs, termContexts);

                while (spans.next()) {
                    System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end());
                    // Load the stored "content" field of the matching document
                    DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("content");
                    reader.document(spans.doc(), visitor);

                    // Print every term held in the document's term vector
                    Terms terms = reader.getTermVector(spans.doc(), "content");
                    TermsEnum tenum = terms.iterator(null);
                    while (tenum.next() != null) {
                        System.out.println(tenum.term().utf8ToString());
                    }
                    // TODO: build a window of terms around the span
                    // (spans.start() - window .. spans.end() + window) from the
                    // term vector positions.
                }
            } finally {
                dr.close();
            }
        } finally {
            writer.close();
        }
    } finally {
        ramDir.close();
    }
}