Example usage for org.apache.lucene.document FieldType setStoreTermVectors

List of usage examples for org.apache.lucene.document FieldType setStoreTermVectors

Introduction

On this page you can find example usages of org.apache.lucene.document FieldType setStoreTermVectors.

Prototype

public void setStoreTermVectors(boolean value) 

Source Link

Document

Set to true if this field's indexed form should also be stored in term vectors.

Usage

From source file:alix.lucene.Alix.java

License:Open Source License

/**
 * Builds a Lucene {@link FieldType} from a compact option string.
 *
 * <p>The string is a set of letters, in any order, loosely following the Luke
 * convention for describing fields ({@code IdfpoPSV}). Letters acted on by
 * this method:
 * <ul>
 * <li>{@code S}: start from {@code TextField.TYPE_STORED} instead of
 * {@code TextField.TYPE_NOT_STORED}; the exact string {@code "S"} yields a
 * default {@code FieldType} with only the stored flag switched on</li>
 * <li>{@code p}: term vectors with positions</li>
 * <li>{@code o}: term vectors with offsets</li>
 * <li>{@code P}: term vectors with positions and payloads</li>
 * <li>{@code V}: term vectors</li>
 * </ul>
 * The letters {@code I} (indexed), {@code d} (docs) and {@code f} (freqs) are
 * part of the convention but are not inspected here: apart from the two
 * special cases above, the index options are always set to
 * {@code DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 *
 * @param options option letters describing the field; may be {@code null},
 *                in which case a default {@code FieldType} is returned
 * @return a new field type; it is not frozen by this method
 */
public static FieldType fieldType(String options) {
    if (options == null) {
        return new FieldType();
    }
    if ("S".equals(options)) {
        FieldType storedOnly = new FieldType();
        storedOnly.setStored(true);
        return storedOnly;
    }
    FieldType type = new FieldType(options.contains("S") ? TextField.TYPE_STORED : TextField.TYPE_NOT_STORED);
    // optimize ?
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    if (options.contains("p")) {
        // Term vector positions are only valid when term vectors themselves are
        // enabled, so switch them on here exactly like the "o" and "P" branches do.
        type.setTokenized(true);
        type.setStoreTermVectors(true);
        type.setStoreTermVectorPositions(true);
    }
    if (options.contains("o")) {
        type.setTokenized(true);
        type.setStoreTermVectors(true);
        type.setStoreTermVectorOffsets(true);
    }
    if (options.contains("P")) {
        // Payloads in term vectors require positions as well.
        type.setTokenized(true);
        type.setStoreTermVectors(true);
        type.setStoreTermVectorPositions(true);
        type.setStoreTermVectorPayloads(true);
    }
    if (options.contains("V")) {
        type.setTokenized(true);
        type.setStoreTermVectors(true);
    }
    return type;
}

From source file:api.startup.PDFIndexer.java

License:Open Source License

/**
 * Indexes a single PDF document and writes it to the given index writer.
 *
 * <p>The Lucene document carries the file path, author, title, conference and
 * last-modified time as stored fields, plus the extracted text as a stored,
 * tokenized field with full term vectors (positions, offsets and payloads).
 * When the writer was opened in {@code CREATE} mode the document is added;
 * otherwise any previously indexed document with the same path is replaced.
 *
 * <p>An {@link IOException} raised while reading or indexing the file is
 * caught and logged together with its cause; it is not rethrown.
 *
 * @param writer   the index writer to write to
 * @param metadata the metadata of the document to index; its filename locates the PDF
 * @throws IOException declared for callers, although I/O failures raised
 *                     inside the method body are logged rather than propagated
 */
static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException {
    Path file = Paths.get(metadata.getFilename());
    try {
        Document doc = new Document();

        Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add Document metadata //
        doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES));
        // End of Document Metadata //

        Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(),
                Field.Store.YES);
        doc.add(modified);

        PDFTextExtractor extractor = new PDFTextExtractor();
        // Get the string contents
        String textContents = extractor.extractText(file.toString());

        // Store the string contents, keeping full term vectors for the text
        FieldType contentsType = new FieldType();
        contentsType.setStored(true);
        contentsType.setTokenized(true);
        contentsType.setStoreTermVectors(true);
        contentsType.setStoreTermVectorPositions(true);
        contentsType.setStoreTermVectorPayloads(true);
        contentsType.setStoreTermVectorOffsets(true);
        contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType);
        doc.add(contents);

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            log.info("adding " + file + " to index");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            log.info("updating " + file + " in index");
            writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc);
        }
    } catch (IOException e) {
        // Pass the exception to the logger so the cause and stack trace are not lost.
        log.error("Failed to read file " + metadata.getFilename(), e);
    }

}

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.utils.NewsItemLuceneDocConverter.java

License:Apache License

/**
 * Creates the field type used for text: indexed, stored, and with term
 * vectors enabled. The returned instance is frozen before it is handed out.
 *
 * @return a frozen field type
 */
private static FieldType getTextType() {
    final FieldType textType = new FieldType();
    textType.setStored(true);
    textType.setIndexed(true);
    textType.setStoreTermVectors(true);
    // Freeze last, once every flag has been set.
    textType.freeze();
    return textType;
}

From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.LuceneIndexBolt.java

License:Apache License

/**
 * Creates a field type that is indexed, stored, and keeps term vectors.
 * The returned instance is not frozen by this method.
 *
 * @return a new field type
 */
private FieldType getType() {
    final FieldType vectorType = new FieldType();
    vectorType.setStored(true);
    vectorType.setIndexed(true);
    vectorType.setStoreTermVectors(true);
    return vectorType;
}

From source file:cc.twittertools.index.IndexStatuses.java

License:Apache License

/**
 * Command-line entry point: builds a Lucene index from a collection of statuses.
 *
 * <p>Required options are the collection directory and the index location.
 * Optional ones select term-vector storage, a bzip2-compressed file of
 * deleted tweet ids to skip, a maximum tweet id, and a final merge of the
 * index into a single segment. The index is always opened in {@code CREATE} mode.
 *
 * <p>The process exits with status {@code -1} when the command line cannot
 * be parsed, when a required option is missing, or when the collection or
 * deletes file does not exist.
 *
 * @param args command-line arguments
 * @throws Exception if reading the deletes file or opening or closing the index fails
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory")
            .create(COLLECTION_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)
            || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexStatuses.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    // Field type for the tweet text; term vectors are stored only on request.
    final FieldType textOptions = new FieldType();
    textOptions.setIndexed(true);
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);
    }

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    LongOpenHashSet deletes = null;
    if (cmdline.hasOption(DELETES_OPTION)) {
        deletes = new LongOpenHashSet();
        File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION));
        if (!deletesFile.exists()) {
            System.err.println("Error: " + deletesFile + " does not exist!");
            System.exit(-1);
        }
        LOG.info("Reading deletes from " + deletesFile);

        FileInputStream fin = new FileInputStream(deletesFile);
        try {
            byte[] ignoreBytes = new byte[2];
            fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools
            BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin)));

            // Each line is either a bare tweet id or "<id>\t<rest>".
            String s;
            while ((s = br.readLine()) != null) {
                if (s.contains("\t")) {
                    deletes.add(Long.parseLong(s.split("\t")[0]));
                } else {
                    deletes.add(Long.parseLong(s));
                }
            }
            br.close();
        } finally {
            // Release the file handle even when decompression or parsing fails.
            fin.close();
        }
        LOG.info("Read " + deletes.size() + " tweetids from deletes file.");
    }

    long maxId = Long.MAX_VALUE;
    if (cmdline.hasOption(MAX_ID_OPTION)) {
        maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION));
        LOG.info("max id: " + maxId);
    }

    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    StatusStream stream = new JsonStatusCorpusReader(file);

    Directory dir = FSDirectory.open(new File(indexPath));
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER);
    config.setOpenMode(OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {
            if (status.getText() == null) {
                continue;
            }

            // Skip deletes tweetids.
            if (deletes != null && deletes.contains(status.getId())) {
                continue;
            }

            if (status.getId() > maxId) {
                continue;
            }

            cnt++;
            Document doc = new Document();
            doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
            doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
            doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

            doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

            // Each count goes into the field of the same name.
            doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount(), Store.YES));
            doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount(), Store.YES));
            doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));

            long inReplyToStatusId = status.getInReplyToStatusId();
            if (inReplyToStatusId > 0) {
                doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId,
                        Field.Store.YES));
                doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(),
                        Field.Store.YES));
            }

            String lang = status.getLang();
            if (!lang.equals("unknown")) {
                doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
            }

            long retweetStatusId = status.getRetweetedStatusId();
            if (retweetStatusId > 0) {
                doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES));
                doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(),
                        Field.Store.YES));
                doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
                if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                    LOG.warn("Error parsing retweet fields of " + status.getId());
                }
            }

            writer.addDocument(doc);
            if (cnt % 100000 == 0) {
                LOG.info(cnt + " statuses indexed");
            }
        }

        LOG.info(String.format("Total of %s statuses added", cnt));

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}

From source file:ci6226.buildindex.java

/**
 * Indexes the Yelp review training set into the Lucene index at {@code ./myindex}.
 *
 * <p>The input file holds one JSON object per line. For every line the
 * {@code review_id} and {@code business_id} are stored as string fields and
 * the review {@code text} as a stored, tokenized field with term vectors
 * (positions and offsets). The running line counter is printed after each
 * document, and the total elapsed time once indexing has finished.
 *
 * @param args the command line arguments (unused)
 * @throws FileNotFoundException if the input file does not exist
 * @throws IOException if reading the input or writing the index fails
 * @throws ParseException kept in the signature for existing callers
 */
public static void main(String[] args) throws FileNotFoundException, IOException, ParseException {
    String file = "/home/steven/Dropbox/workspace/ntu_coursework/ci6226/Assiment/yelpdata/yelp_training_set/yelp_training_set_review.json";

    Date start = new Date();
    String indexPath = "./myindex";
    System.out.println("Indexing to directory '" + indexPath + "'...");
    // TODO: try different analyzers, stop words and stemming, and check the index size
    Analyzer analyzer = new myAnalyzer();

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    // Add new documents to an existing index, creating it when absent:
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    // Optional: for better indexing performance, if you
    // are indexing many documents, increase the RAM
    // buffer.  But if you do this, increase the max heap
    // size to the JVM (eg add -Xmx512m or -Xmx1g):
    //
    // iwc.setRAMBufferSizeMB(256.0);

    // The review text uses the same field type for every document, so build it once.
    // See http://qindongliang1922.iteye.com/blog/2030639
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setStored(true);
    ft.setStoreTermVectors(true);
    ft.setTokenized(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);

    BufferedReader in = new BufferedReader(new FileReader(file));
    try {
        Directory dir = FSDirectory.open(new File(indexPath));
        try {
            IndexWriter writer = new IndexWriter(dir, iwc);
            try {
                int line = 0;
                String s;
                // readLine() returns null at end of input; ready() is not an EOF test.
                while ((s = in.readLine()) != null) {
                    JSONObject review = (JSONObject) JSONValue.parse(s);
                    String text = (String) review.get("text");
                    String business_id = (String) review.get("business_id");
                    String review_id = (String) review.get("review_id");

                    Document doc = new Document();
                    doc.add(new StringField("review_id", review_id, Field.Store.YES));
                    doc.add(new StringField("business_id", business_id, Field.Store.YES));
                    doc.add(new Field("text", text, ft));

                    writer.addDocument(doc);

                    System.out.println(line++);
                }
            } finally {
                // Close the writer on failure too, so the index lock is released.
                writer.close();
            }
        } finally {
            dir.close();
        }
    } finally {
        in.close();
    }

    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");
}

From source file:ci6226.eval_index_writer.java

/**
 * Builds (or appends to) the Lucene index at {@code "./" + _dir} from a file
 * of reviews, one JSON object per line.
 *
 * <p>For every line the {@code review_id} is stored as a string field and
 * the review {@code text} as a stored, tokenized field with term vectors
 * (positions and offsets). The elapsed time is printed once indexing has
 * finished.
 *
 * @param _analyzer        analyzer used to tokenize the review text
 * @param _iReviewLocation path of the review file to read
 * @param _dir             name of the index directory, resolved against the working directory
 * @throws IOException if reading the review file or writing the index fails
 */
public eval_index_writer(Analyzer _analyzer, String _iReviewLocation, String _dir) throws IOException {
    Date start = new Date();
    String indexPath = "./" + _dir;
    System.out.println("Indexing to directory '" + indexPath + "'...");

    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, _analyzer);
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

    // The review text uses the same field type for every document, so build it once.
    // See http://qindongliang1922.iteye.com/blog/2030639
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setStored(true);
    ft.setStoreTermVectors(true);
    ft.setTokenized(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);

    BufferedReader in = new BufferedReader(new FileReader(_iReviewLocation));
    try {
        Directory dir = FSDirectory.open(new File(indexPath));
        try {
            IndexWriter writer = new IndexWriter(dir, iwc);
            try {
                String s;
                // readLine() returns null at end of input; ready() is not an EOF test.
                while ((s = in.readLine()) != null) {
                    JSONObject review = (JSONObject) JSONValue.parse(s);
                    String text = (String) review.get("text");
                    String review_id = (String) review.get("review_id");

                    Document doc = new Document();
                    doc.add(new StringField("review_id", review_id, Field.Store.YES));
                    doc.add(new Field("text", text, ft));

                    writer.addDocument(doc);
                }
            } finally {
                // Close the writer on failure too, so the index lock is released.
                writer.close();
            }
        } finally {
            dir.close();
        }
    } finally {
        in.close();
    }

    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");
}

From source file:com.github.hotware.lucene.extension.bean.field.BeanInformationCacheImpl.java

License:BEER-WARE LICENSE

/**
 * Assembles the {@code FieldInformation} for one annotated bean field.
 *
 * <p>The type wrapper class named by the annotation is instantiated first.
 * A Lucene {@code FieldType} is then filled in from the annotation's
 * settings, passed to the wrapper for further configuration, and frozen.
 *
 * @param bf         the annotation carrying the index settings
 * @param field      the reflected bean field
 * @param fieldClass the class to record alongside the field
 * @return the field information wrapping a frozen view of the field
 * @throws RuntimeException if the type wrapper cannot be instantiated
 */
private FieldInformation buildFieldInformation(BeanField bf, Field field, Class<?> fieldClass) {
    final com.github.hotware.lucene.extension.bean.type.Type wrapper;
    try {
        // TODO: maybe cache these?
        Object instance = bf.type().newInstance();
        wrapper = (com.github.hotware.lucene.extension.bean.type.Type) instance;
    } catch (InstantiationException | IllegalAccessException e) {
        throw new RuntimeException(e);
    }

    // Copy every setting from the annotation onto a fresh field type.
    final FieldType luceneType = new FieldType();
    luceneType.setIndexed(bf.index());
    luceneType.setStored(bf.store());
    luceneType.setTokenized(bf.tokenized());
    luceneType.setStoreTermVectors(bf.storeTermVectors());
    luceneType.setStoreTermVectorPositions(bf.storeTermVectorPositions());
    luceneType.setStoreTermVectorOffsets(bf.storeTermVectorOffsets());
    luceneType.setStoreTermVectorPayloads(bf.storeTermVectorPayloads());
    luceneType.setOmitNorms(bf.omitNorms());
    luceneType.setIndexOptions(bf.indexOptions());

    // The wrapper configures the type before it is frozen.
    wrapper.configureFieldType(luceneType);
    luceneType.freeze();

    final FrozenField frozen = new FrozenField(field);
    return new FieldInformation(frozen, fieldClass, luceneType, bf);
}

From source file:com.github.hotware.lucene.extension.highlight.BaseObjectFragmentsBuilder.java

License:BEER-WARE LICENSE

/**
 * Loads the stored string values of one field from a document and wraps each
 * in a new {@code Field}.
 *
 * <p>Each returned field uses a copy of {@code TextField.TYPE_STORED} whose
 * term-vector flag mirrors {@code FieldInfo.hasVectors()} for that field.
 *
 * @param reader    the reader to load the document from
 * @param docId     the id of the document
 * @param fieldName the name of the stored field to collect
 * @return the collected fields; empty when no string value was visited
 * @throws IOException if the reader fails to load the document
 */
private static Field[] getFields(IndexReader reader, int docId, final String fieldName) throws IOException {
    // according to javadoc, doc.getFields(fieldName) cannot be used with
    // lazy loaded field???
    final List<Field> collected = new ArrayList<>();

    StoredFieldVisitor visitor = new StoredFieldVisitor() {

        @Override
        public Status needsField(FieldInfo fieldInfo) {
            // Only the requested field is of interest.
            if (fieldInfo.name.equals(fieldName)) {
                return Status.YES;
            }
            return Status.NO;
        }

        @Override
        public void stringField(FieldInfo fieldInfo, String value) {
            FieldType storedType = new FieldType(TextField.TYPE_STORED);
            storedType.setStoreTermVectors(fieldInfo.hasVectors());
            collected.add(new Field(fieldInfo.name, value, storedType));
        }

    };
    reader.document(docId, visitor);

    Field[] result = new Field[collected.size()];
    return collected.toArray(result);
}

From source file:com.meizu.nlp.classification.ClassificationTestBase.java

License:Apache License

/**
 * Replaces the current index with 1000 randomly generated documents.
 *
 * <p>The existing writer is closed and a new one is opened in {@code CREATE}
 * mode. Every document gets a random text field, a category field
 * ({@code "technology"} or {@code "politics"}) and a boolean field whose
 * value matches the category choice. All three fields are stored and keep
 * term vectors with offsets and positions.
 *
 * @param analyzer the analyzer for the new index writer configuration
 * @throws IOException if closing, writing or committing the index fails
 */
private void populatePerformanceIndex(Analyzer analyzer) throws IOException {
    indexWriter.close();
    indexWriter = new RandomIndexWriter(random(), dir,
            newIndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    indexWriter.commit();

    // Stored text type with full term-vector detail, shared by all three fields.
    final FieldType vectorType = new FieldType(TextField.TYPE_STORED);
    vectorType.setStoreTermVectors(true);
    vectorType.setStoreTermVectorOffsets(true);
    vectorType.setStoreTermVectorPositions(true);

    final int docCount = 1000;
    final Random rnd = random();
    int written = 0;
    while (written < docCount) {
        // Draw the category flag before the random text, keeping the random sequence stable.
        final boolean isTechnology = rnd.nextBoolean();
        final Document document = new Document();
        document.add(new Field(textFieldName, createRandomString(rnd), vectorType));

        final String category;
        if (isTechnology) {
            category = "technology";
        } else {
            category = "politics";
        }
        document.add(new Field(categoryFieldName, category, vectorType));
        document.add(new Field(booleanFieldName, String.valueOf(isTechnology), vectorType));

        indexWriter.addDocument(document);
        written++;
    }
    indexWriter.commit();
}