Example usage for org.apache.lucene.index IndexOptions DOCS_AND_FREQS_AND_POSITIONS

List of usage examples for org.apache.lucene.index IndexOptions DOCS_AND_FREQS_AND_POSITIONS

Introduction

On this page you can find example usages of org.apache.lucene.index IndexOptions DOCS_AND_FREQS_AND_POSITIONS.

Prototype

IndexOptions DOCS_AND_FREQS_AND_POSITIONS

To view the source code for org.apache.lucene.index IndexOptions DOCS_AND_FREQS_AND_POSITIONS, use the Source Link below.

Click Source Link

Document

Indexes documents, frequencies and positions.

Usage

From source file:com.lucure.core.codec.LucurePostingsWriter.java

License:Apache License

/**
 * Configures this writer for the given field and reports how many file
 * pointers of term metadata the field will carry.
 *
 * @param fieldInfo the field being written
 * @return 1 (doc FP only), 2 (doc + pos FP), or 3 (doc + pos + pay FP)
 */
@Override
public int setField(FieldInfo fieldInfo) {
    // IndexOptions is ordered from least to most detailed, so compareTo
    // tells us which postings features this field records.
    final IndexOptions opts = fieldInfo.getIndexOptions();
    fieldHasFreqs = opts.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    fieldHasPositions = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    fieldHasOffsets = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    fieldHasPayloads = fieldInfo.hasPayloads();
    skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads);
    lastState = emptyState;
    // Doc FP is always present; pos FP when positions are indexed; pay FP
    // when payloads or offsets ride along with positions.
    if (!fieldHasPositions) {
        return 1; // doc FP only
    }
    return (fieldHasPayloads || fieldHasOffsets) ? 3 : 2;
}

From source file:com.rocana.lucene.codec.v1.RocanaFieldReader.java

License:Apache License

/**
 * @return {@code true} when this field indexes positions, i.e. its index
 *         options are DOCS_AND_FREQS_AND_POSITIONS or richer.
 */
@Override
public boolean hasPositions() {
    final IndexOptions options = fieldInfo.getIndexOptions();
    return options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}

From source file:com.sindicetech.siren.index.codecs.siren10.Siren10PostingsWriter.java

License:Open Source License

/**
 * Records one position for the current term. The position's node identifier
 * is carried in the payload; when the node changes, the accumulated term
 * frequency for the previous node is flushed and the new node is started.
 *
 * @param position    term position within the node
 * @param payload     SIREn payload encoding the node id and position (never null here)
 * @param startOffset unused — offsets are not supported by this writer
 * @param endOffset   unused — offsets are not supported by this writer
 */
@Override
public void addPosition(final int position, final BytesRef payload, final int startOffset, final int endOffset)
        throws IOException {
    // Only positional (no-offsets) indexes may reach this writer.
    assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
    // we always receive node ids in the payload
    assert payload != null;

    // decode payload
    sirenPayload.decode(payload);
    final IntsRef node = sirenPayload.getNode();

    // check if we received the same node
    // TODO: we pay the cost of decoding the node before testing the equality
    // we could instead directly compute the node hash based on the byte array
    final int nodeHash = node.hashCode();
    if (lastNodeHash != nodeHash) { // if different node
        // add term freq for previous node if not first payload.
        // NOTE(review): Long.MAX_VALUE appears to be the "no previous node"
        // sentinel — an int hash can never equal it, so lastNodeHash is
        // presumably declared as a long initialized to Long.MAX_VALUE; confirm
        // against the field declaration (not visible in this chunk).
        if (lastNodeHash != Long.MAX_VALUE) {
            this.addTermFreqInNode();
        }
        // add new node
        this.addNode(node);
    }
    lastNodeHash = nodeHash;

    // add position
    this.addPosition(sirenPayload.getPosition());
}

From source file:com.sindicetech.siren.util.SirenTestCase.java

License:Open Source License

/**
 * Builds the field type used by the SIREn test fixtures: indexed with
 * positions, tokenized, norms kept, value not stored.
 */
private static FieldType newFieldType() {
    final FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    fieldType.setIndexed(true);
    fieldType.setTokenized(true);
    fieldType.setOmitNorms(false);
    fieldType.setStored(false);
    return fieldType;
}

From source file:com.vmware.xenon.services.common.Lucene60FieldInfosFormatWithCache.java

License:Open Source License

/**
 * Decodes the on-disk byte tag into its {@link IndexOptions} constant.
 *
 * @param input the index input, used only for exception context
 * @param b     the encoded byte (0..4)
 * @throws CorruptIndexException if the byte is outside the known range
 */
private static IndexOptions getIndexOptions(IndexInput input, byte b) throws IOException {
    // Tag values 0..4 map, in order, to increasingly detailed options.
    final IndexOptions[] decoded = { IndexOptions.NONE, IndexOptions.DOCS, IndexOptions.DOCS_AND_FREQS,
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS };
    if (b < 0 || b >= decoded.length) {
        // BUG
        throw new CorruptIndexException("invalid IndexOptions byte: " + b, input);
    }
    return decoded[b];
}

From source file:edu.umass.cs.ciir.IndexFromGalago.java

License:Open Source License

/**
 * Converts a Galago disk index ("galagoIndex" argument) into a Lucene index
 * ("luceneIndex" argument): iterates every document in the Galago corpus,
 * strips HTML with jsoup, and indexes the text with positions and term vectors.
 * Indexing errors are logged to {@code indexing-errors.log}.
 */
public static void main(String[] args) throws Exception {
    Parameters argp = Parameters.parseArgs(args);
    String galagoIndexPath = null;
    String luceneIndexPath = null;
    try {
        galagoIndexPath = argp.getString("galagoIndex");
        luceneIndexPath = argp.getString("luceneIndex");
    } catch (Exception e) {
        // Either required argument is missing: print usage and exit.
        System.out.println(getUsage());
        return;
    }

    // Route indexing errors to a dedicated log file instead of the console.
    logger.setUseParentHandlers(false);
    FileHandler lfh = new FileHandler("indexing-errors.log");
    SimpleFormatter formatter = new SimpleFormatter();
    lfh.setFormatter(formatter);
    logger.addHandler(lfh);

    final DiskIndex index = new DiskIndex(argp.get("index", galagoIndexPath));
    final CorpusReader corpus = (CorpusReader) index.getIndexPart("corpus");
    long total = corpus.getManifest().getLong("keyCount");
    final CorpusReader.KeyIterator iterator = corpus.getIterator();

    final Document.DocumentComponents dcp = Document.DocumentComponents.JustText;
    // Analyzer includes options for text processing
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Step 1: tokenization (Lucene's StandardTokenizer is suitable for most text retrieval occasions)
            TokenStreamComponents ts = new TokenStreamComponents(new StandardTokenizer());
            // Step 2: transforming all tokens into lowercased ones
            ts = new Analyzer.TokenStreamComponents(ts.getTokenizer(),
                    new LowerCaseFilter(ts.getTokenStream()));
            // Step 3: whether to remove stop words
            // Uncomment the following line to remove stop words
            // ts = new TokenStreamComponents( ts.getTokenizer(), new StopwordsFilter( ts.getTokenStream(), StandardAnalyzer.ENGLISH_STOP_WORDS_SET ) );
            // Step 4: whether to apply stemming
            // Uncomment the following line to apply Krovetz or Porter stemmer
            // ts = new TokenStreamComponents( ts.getTokenizer(), new KStemFilter( ts.getTokenStream() ) );
            // ts = new TokenStreamComponents( ts.getTokenizer(), new PorterStemFilter( ts.getTokenStream() ) );
            return ts;
        }
    };

    try (final FSDirectory dir = FSDirectory.open(Paths.get(argp.get("output", luceneIndexPath)))) {
        final IndexWriterConfig cfg = new IndexWriterConfig(analyzer);
        System.out.println("Similarity: " + cfg.getSimilarity());
        // CREATE mode: any existing index at the output path is overwritten.
        cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            iterator.forAllKeyStrings(docId -> {
                try {
                    Document document = iterator.getDocument(dcp);

                    String text = document.text;
                    String id = document.name;
                    System.out.println("Processing document: " + id);
                    org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
                    doc.add(new StringField("id", id, Field.Store.YES));
                    // this stores the actual text with tags so formatting is preserved
                    doc.add(new StoredField("body", text));
                    org.jsoup.nodes.Document jsoup = Jsoup.parse(text);

                    // tokens of the document
                    // Positional index with term vectors; the tag-stripped text
                    // is indexed but not stored (raw text lives in "body").
                    FieldType fieldTypeText = new FieldType();
                    fieldTypeText.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
                    fieldTypeText.setStoreTermVectors(true);
                    fieldTypeText.setStoreTermVectorPositions(true);
                    fieldTypeText.setTokenized(true);
                    fieldTypeText.setStored(false);
                    fieldTypeText.freeze();
                    doc.add(new Field("tokens", jsoup.text(), fieldTypeText));

                    try {
                        writer.addDocument(doc);
                        System.out.println("Doc count: " + writer.numDocs());
                    } catch (IOException e) {
                        logger.log(Level.WARNING, "Pull-Document-Exception", e);
                        System.err.println(e.toString());
                    }

                } catch (Exception e) {
                    // Best-effort: log the failing document and keep indexing.
                    logger.log(Level.WARNING, "Pull-Document-Exception", e);
                    System.err.println(e.toString());
                }
            });

        }
    }

    System.out.println("Indexing Done. ");
}

From source file:io.anserini.index.generator.LuceneDocumentGenerator.java

License:Apache License

/**
 * Builds a Lucene {@code Document} from a source document, or returns
 * {@code null} when the document should be skipped (transform failure or
 * empty content). Skips are tallied in {@code counters}.
 *
 * @param src the source document to convert
 * @return the Lucene document, or {@code null} if skipped
 */
public Document createDocument(SourceDocument src) {
    final String id = src.id();

    final String contents;
    try {
        // If there's a transform, use it.
        contents = transform == null ? src.content() : transform.apply(src.content());
    } catch (Exception e) {
        LOG.error("Error extracting document text, skipping document: " + id, e);
        counters.errors.incrementAndGet();
        return null;
    }

    if (contents.trim().isEmpty()) {
        LOG.info("Empty document: " + id);
        counters.emptyDocuments.incrementAndGet();
        return null;
    }

    final Document document = new Document();

    // document id
    document.add(new StringField(FIELD_ID, id, Field.Store.YES));

    if (args.storeRawDocs) {
        // Keep the untransformed content as a stored-only field.
        document.add(new StoredField(FIELD_RAW, src.content()));
    }

    final FieldType fieldType = new FieldType();
    fieldType.setStored(args.storeTransformedDocs);

    // Term vectors with positions, when document vectors are requested.
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // "Positional" index records positions; "count" index stops at freqs.
    fieldType.setIndexOptions(args.storePositions
            ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS
            : IndexOptions.DOCS_AND_FREQS);

    document.add(new Field(FIELD_BODY, contents, fieldType));

    return document;
}

From source file:io.anserini.index.generator.TweetGenerator.java

License:Apache License

/**
 * Builds a Lucene {@code Document} from a tweet, or returns {@code null} when
 * the tweet should be skipped: empty or invalid text, a deleted id, an id
 * above the configured maximum, or a retweet when retweets are excluded.
 * Skip reasons are tallied in {@code counters}.
 */
@Override
public Document createDocument(TweetCollection.Document tweetDoc) {
    String id = tweetDoc.id();

    if (tweetDoc.content().trim().isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }
    // Validate the tweet text and keep only the range Twitter considers valid.
    final TwitterTextParseResults result = TwitterTextParser.parseTweet(tweetDoc.content().trim());
    if (!result.isValid) {
        counters.errors.incrementAndGet();
        return null;
    }
    String text = tweetDoc.content().trim().substring(result.validTextRange.start, result.validTextRange.end);

    if (!args.tweetKeepUrls) {
        // Remove every URL found by the Twitter extractor from the indexed text.
        final Extractor extractor = new Extractor();
        final List<String> urls = extractor.extractURLs(text);
        for (String url : urls) {
            text = text.replaceAll(url, "");
        }
    }
    text = text.trim();
    if (text.isEmpty()) {
        counters.empty.incrementAndGet();
        return null;
    }

    // Skip deletes tweetids.
    if (deletes != null && deletes.contains(id)) {
        counters.skipped.incrementAndGet();
        return null;
    }

    if (tweetDoc.getIdLong() > args.tweetMaxId) {
        counters.skipped.incrementAndGet();
        return null;
    }

    if (!args.tweetKeepRetweets && tweetDoc.getRetweetedStatusId().isPresent()) {
        counters.skipped.incrementAndGet();
        return null;
    }

    Document doc = new Document();
    doc.add(new StringField(FIELD_ID, id, Field.Store.YES));

    // We need this to break scoring ties.
    doc.add(new LongPoint(StatusField.ID_LONG.name, tweetDoc.getIdLong()));
    doc.add(new NumericDocValuesField(StatusField.ID_LONG.name, tweetDoc.getIdLong()));

    tweetDoc.getEpoch().ifPresent(epoch -> doc.add(new LongPoint(StatusField.EPOCH.name, epoch)));
    doc.add(new StringField(StatusField.SCREEN_NAME.name, tweetDoc.getScreenName(), Field.Store.NO));
    // NOTE(review): FRIENDS_COUNT is filled from getFollowersCount() and
    // FOLLOWERS_COUNT from getFriendsCount() — these look swapped; confirm
    // whether this is intentional before relying on either field.
    doc.add(new IntPoint(StatusField.FRIENDS_COUNT.name, tweetDoc.getFollowersCount()));
    doc.add(new IntPoint(StatusField.FOLLOWERS_COUNT.name, tweetDoc.getFriendsCount()));
    doc.add(new IntPoint(StatusField.STATUSES_COUNT.name, tweetDoc.getStatusesCount()));

    tweetDoc.getInReplyToStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.IN_REPLY_TO_STATUS_ID.name, rid));
        tweetDoc.getInReplyToUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.IN_REPLY_TO_USER_ID.name, ruid)));
    });

    tweetDoc.getRetweetedStatusId().ifPresent(rid -> {
        doc.add(new LongPoint(StatusField.RETWEETED_STATUS_ID.name, rid));
        tweetDoc.getRetweetedUserId()
                .ifPresent(ruid -> doc.add(new LongPoint(StatusField.RETWEETED_USER_ID.name, ruid)));
        tweetDoc.getRetweetCount().ifPresent(rc -> doc.add(new LongPoint(StatusField.RETWEET_COUNT.name, rc)));
    });

    tweetDoc.getLang().ifPresent(lang -> doc.add(new StringField(StatusField.LANG.name, lang, Field.Store.NO)));

    if (args.storeRawDocs) { // store the raw json string as one single field
        doc.add(new StoredField(FIELD_RAW, tweetDoc.getJsonString()));
    }

    FieldType fieldType = new FieldType();

    fieldType.setStored(args.storeTransformedDocs);

    // Are we storing document vectors?
    if (args.storeDocvectors) {
        fieldType.setStoreTermVectors(true);
        fieldType.setStoreTermVectorPositions(true);
    }

    // Are we building a "positional" or "count" index?
    if (args.storePositions) {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    } else {
        fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    }

    doc.add(new Field(FIELD_BODY, text, fieldType));

    return doc;
}

From source file:io.anserini.index.IndexTweets.java

License:Apache License

/**
 * Command-line driver that indexes a JSON tweet corpus into a new Lucene index.
 * Options: collection directory, index location, optional deletes file
 * (bzip2-compressed tweetids to skip), optional max tweet id, term vectors,
 * and a final force-merge ("optimize").
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory")
            .create(COLLECTION_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)
            || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweets.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    // Tweet text: stored, tokenized, indexed with positions; term vectors optional.
    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);
    }

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    LongOpenHashSet deletes = null;
    if (cmdline.hasOption(DELETES_OPTION)) {
        deletes = new LongOpenHashSet();
        File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION));
        if (!deletesFile.exists()) {
            System.err.println("Error: " + deletesFile + " does not exist!");
            System.exit(-1);
        }
        LOG.info("Reading deletes from " + deletesFile);

        // NOTE(review): the streams below are not closed via try-with-resources,
        // so they leak if an exception is thrown mid-read; consider restructuring.
        FileInputStream fin = new FileInputStream(deletesFile);
        byte[] ignoreBytes = new byte[2];
        // NOTE(review): the return value of read() is ignored — assumes both
        // header bytes are always consumed in one call; confirm acceptable.
        fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools
        BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin)));

        String s;
        while ((s = br.readLine()) != null) {
            // Lines may be "tweetid<TAB>..." or a bare tweetid.
            if (s.contains("\t")) {
                deletes.add(Long.parseLong(s.split("\t")[0]));
            } else {
                deletes.add(Long.parseLong(s));
            }
        }
        br.close();
        fin.close();
        LOG.info("Read " + deletes.size() + " tweetids from deletes file.");
    }

    long maxId = Long.MAX_VALUE;
    if (cmdline.hasOption(MAX_ID_OPTION)) {
        maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION));
        LOG.info("index: " + maxId);
    }

    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    StatusStream stream = new JsonStatusCorpusReader(file);

    Directory dir = FSDirectory.open(Paths.get(indexPath));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    // CREATE mode: any existing index at this path is overwritten.
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {
            if (status.getText() == null) {
                continue;
            }

            // Skip deletes tweetids.
            if (deletes != null && deletes.contains(status.getId())) {
                continue;
            }

            if (status.getId() > maxId) {
                continue;
            }

            cnt++;
            Document doc = new Document();
            doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
            doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
            doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

            doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

            // NOTE(review): FRIENDS_COUNT is filled from getFollowersCount() and
            // FOLLOWERS_COUNT from getFriendsCount() — these look swapped; confirm intent.
            doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES));
            doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES));
            doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));

            long inReplyToStatusId = status.getInReplyToStatusId();
            if (inReplyToStatusId > 0) {
                doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId,
                        Field.Store.YES));
                doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(),
                        Field.Store.YES));
            }

            String lang = status.getLang();
            if (!lang.equals("unknown")) {
                doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
            }

            long retweetStatusId = status.getRetweetedStatusId();
            if (retweetStatusId > 0) {
                doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES));
                doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(),
                        Field.Store.YES));
                doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
                if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                    LOG.warn("Error parsing retweet fields of " + status.getId());
                }
            }

            writer.addDocument(doc);
            if (cnt % 100000 == 0) {
                LOG.info(cnt + " statuses indexed");
            }
        }

        LOG.info(String.format("Total of %s statuses added", cnt));

        if (cmdline.hasOption(OPTIMIZE_OPTION)) {
            LOG.info("Merging segments...");
            writer.forceMerge(1);
            LOG.info("Done!");
        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}

From source file:io.anserini.index.IndexTweetsUpdatePlace.java

License:Apache License

/**
 * Command-line driver that re-reads a JSON tweet corpus and, for every tweet
 * that has place information, rebuilds its Lucene document (including
 * longitude/latitude/place fields) and updates it in an EXISTING index via
 * {@code updateDocument}, keyed on the tweet id.
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(OptionBuilder.withArgName("collection").hasArg()
            .withDescription("source collection directory").create(COLLECTION_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)
            || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweetsUpdatePlace.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    System.out.println(collectionPath + " " + indexPath);

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    // Tweet text: stored, tokenized, indexed with positions; term vectors optional.
    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);

    }

    final StatusStream stream = new JsonStatusCorpusReader(file);

    final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION)));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    // APPEND mode: the index must already exist; documents are updated in place.
    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND);

    final IndexWriter writer = new IndexWriter(dir, config);
    System.out.print("Original # of docs " + writer.numDocs());
    int updateCount = 0;

    // Close resources on Ctrl-C / JVM shutdown so partial progress is committed.
    // NOTE(review): the finally block below also closes writer/dir/stream, so on a
    // normal exit both paths run close(); confirm double-close is harmless here.
    Runtime.getRuntime().addShutdownHook(new Thread() {
        public void run() {

            try {
                stream.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }

            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.println("Shutting down");

        }
    });
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {

            // Only tweets carrying place information are updated.
            if (status.getPlace() != null) {

                //               Query q = NumericRangeQuery.newLongRange(TweetStreamReader.StatusField.ID.name, status.getId(),
                //                     status.getId(), true, true);
                //               System.out.print("Deleting docCount="+writer.numDocs());
                //               writer.deleteDocuments(q);
                //               writer.commit();
                //               System.out.print(" Deleted docCount="+writer.numDocs());

                Document doc = new Document();
                doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
                doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
                doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

                doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

                // NOTE(review): FRIENDS_COUNT is filled from getFollowersCount() and
                // FOLLOWERS_COUNT from getFriendsCount() — these look swapped; confirm intent.
                doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES));
                doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES));
                doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));
                doc.add(new DoubleField(StatusField.LONGITUDE.name, status.getLongitude(), Store.YES));
                doc.add(new DoubleField(StatusField.LATITUDE.name, status.getlatitude(), Store.YES));
                doc.add(new StringField(StatusField.PLACE.name, status.getPlace(), Store.YES));
                long inReplyToStatusId = status.getInReplyToStatusId();
                if (inReplyToStatusId > 0) {
                    doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(),
                            Field.Store.YES));
                }

                String lang = status.getLang();
                if (!lang.equals("unknown")) {
                    doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
                }

                long retweetStatusId = status.getRetweetedStatusId();
                if (retweetStatusId > 0) {
                    doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(),
                            Field.Store.YES));
                    doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
                    if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                        LOG.warn("Error parsing retweet fields of " + status.getId());
                    }
                }

                // Replace the existing document whose ID term matches this tweet.
                long id = status.getId();
                BytesRefBuilder brb = new BytesRefBuilder();
                NumericUtils.longToPrefixCodedBytes(id, 0, brb);
                Term term = new Term(StatusField.ID.name, brb.get());
                writer.updateDocument(term, doc);

                //               writer.addDocument(doc);

                updateCount += 1;

                // Commit periodically so progress survives interruption.
                if (updateCount % 10000 == 0) {

                    LOG.info(updateCount + " statuses updated");
                    writer.commit();
                    System.out.println("Updated docCount=" + writer.numDocs());
                }

            }

        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}