Example usage for org.apache.lucene.index IndexWriter commit

List of usage examples for org.apache.lucene.index IndexWriter commit

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexWriter.commit() collected from open-source projects.

Prototype

@Override
public final long commit() throws IOException 

Document

Commits all pending changes (added and deleted documents, segment merges, added indexes, etc.) to the index, and syncs all referenced index files, such that a reader will see the changes and the index updates will survive an OS or machine crash or power loss.
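
In Lucene versions where commit() returns a long, the returned value is the sequence number of the commit operation.

Before the collected examples, here is a minimal, self-contained sketch of the call. The class name, index path, and field name are placeholders, not taken from the examples below: a document is added, commit() makes it durable and visible to readers opened afterwards, and close() releases the writer.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CommitExample {
    public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(Paths.get("example-index")); // placeholder path
        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()));
        try {
            Document doc = new Document();
            doc.add(new TextField("text", "hello lucene", Field.Store.YES));
            writer.addDocument(doc);
            // Persist the pending change: a reader opened after this point
            // will see the document, and it survives a crash.
            writer.commit();
        } finally {
            writer.close(); // close() also commits any remaining pending changes
            dir.close();
        }
    }
}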

Usage

From source file:io.anserini.index.IndexCollection.java

License:Apache License
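
Indexes a document collection with a fixed pool of indexer threads, logs progress while awaiting termination, and issues a single commit() (plus an optional forceMerge(1)) once all segments have been processed.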

public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");

    int numThreads = args.threads;

    final Directory dir = FSDirectory.open(indexPath);
    final EnglishAnalyzer analyzer = args.keepStopwords ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
            : new EnglishAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setSimilarity(new BM25Similarity());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, config);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final List<Path> segmentPaths = collection.getFileSegmentPaths();

    final int segmentCnt = segmentPaths.size();
    LOG.info(segmentCnt + " files found in " + collectionPath.toString());
    for (int i = 0; i < segmentCnt; i++) {
        executor.execute(new IndexerThread(writer, collection, segmentPaths.get(i)));
    }

    executor.shutdown();

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
            LOG.info(String.format("%.2f percent completed",
                    (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d));
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (segmentCnt != executor.getCompletedTaskCount()) {
        throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount =  "
                + executor.getCompletedTaskCount());
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (args.optimize)
            writer.forceMerge(1);
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            // It is possible that this happens... but nothing much we can do at this point,
            // so just log the error and move on.
            LOG.error(e);
        }
    }

    LOG.info("Indexed documents: " + counters.indexedDocuments.get());
    LOG.info("Empty documents: " + counters.emptyDocuments.get());
    LOG.info("Errors: " + counters.errors.get());

    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in "
            + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
}

From source file:io.anserini.index.IndexGov2.java

License:Apache License
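
Indexes the Gov2 collection on multiple threads, attaches user data to the commit via setCommitData(...) before calling commit(), and logs the time spent in each phase.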

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {

    IndexArgs indexArgs = new IndexArgs();

    CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: IndexGov2" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    final String dirPath = indexArgs.index;
    final String dataDir = indexArgs.input;
    final int docCountLimit = indexArgs.doclimit;
    final int numThreads = indexArgs.threads;

    final boolean positions = indexArgs.positions;
    final boolean optimize = indexArgs.optimize;

    final Analyzer a = new EnglishAnalyzer();
    final TrecContentSource trecSource = createGov2Source(dataDir);
    final Directory dir = FSDirectory.open(Paths.get(dirPath));

    LOG.info("Index path: " + dirPath);
    LOG.info("Doc limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    LOG.info("Threads: " + numThreads);
    LOG.info("Positions: " + positions);
    LOG.info("Optimize (merge segments): " + optimize);

    final IndexWriterConfig config = new IndexWriterConfig(a);

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    final IndexWriter writer = new IndexWriter(dir, config);
    Gov2IndexThreads threads = new Gov2IndexThreads(writer, positions, trecSource, numThreads, docCountLimit);
    LOG.info("Indexer: start");

    final long t0 = System.currentTimeMillis();

    threads.start();

    while (!threads.done()) {
        Thread.sleep(100);
    }
    threads.stop();

    final long t1 = System.currentTimeMillis();
    LOG.info("Indexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + writer.maxDoc() + " docs");
    if (docCountLimit != -1 && writer.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + writer.maxDoc() + " but expected " + docCountLimit);
    }
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }

    final long t2 = System.currentTimeMillis();

    final Map<String, String> commitData = new HashMap<String, String>();
    commitData.put("userData", "multi");
    writer.setCommitData(commitData);
    writer.commit();
    final long t3 = System.currentTimeMillis();
    LOG.info("Indexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)");

    if (optimize) {
        LOG.info("Indexer: merging all segments");
        writer.forceMerge(1);
        final long t4 = System.currentTimeMillis();
        LOG.info("Indexer: segments merged (took " + (t4 - t3) / 1000.0 + " sec)");
    }

    // TODO: Doesn't compile for 5.3.1 - do we actually need this?
    //LOG.info("Indexer: at close: " + writer.segString());
    final long tCloseStart = System.currentTimeMillis();
    writer.close();
    LOG.info("Indexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec");
    dir.close();
    final long tFinal = System.currentTimeMillis();
    LOG.info("Indexer: finished (" + (tFinal - t0) / 1000.0 + " sec)");
    LOG.info("Indexer: net bytes indexed " + threads.getBytesIndexed());
    LOG.info("Indexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.))
            + " GB/hour plain text");
}

From source file:io.anserini.index.IndexTweetsUpdatePlace.java

License:Apache License
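
Re-reads a tweet collection and rewrites tweets that carry place information via updateDocument(...), calling commit() every 10,000 updates so progress is persisted incrementally.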

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(OptionBuilder.withArgName("collection").hasArg()
            .withDescription("source collection directory").create(COLLECTION_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)
            || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweetsUpdatePlace.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    System.out.println(collectionPath + " " + indexPath);

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);

    }

    final StatusStream stream = new JsonStatusCorpusReader(file);

    final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION)));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND);

    final IndexWriter writer = new IndexWriter(dir, config);
    System.out.print("Original # of docs " + writer.numDocs());
    int updateCount = 0;

    Runtime.getRuntime().addShutdownHook(new Thread() {
        public void run() {

            // Close the stream, writer, and directory on shutdown; at this
            // point there is nothing to do about failures beyond reporting them.
            try {
                stream.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }

            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
            System.out.println("Shutting down");

        }
    });
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {

            if (status.getPlace() != null) {

                Document doc = new Document();
                doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
                doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
                doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

                doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

                doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount(), Store.YES));
                doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount(), Store.YES));
                doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));
                doc.add(new DoubleField(StatusField.LONGITUDE.name, status.getLongitude(), Store.YES));
                doc.add(new DoubleField(StatusField.LATITUDE.name, status.getlatitude(), Store.YES));
                doc.add(new StringField(StatusField.PLACE.name, status.getPlace(), Store.YES));
                long inReplyToStatusId = status.getInReplyToStatusId();
                if (inReplyToStatusId > 0) {
                    doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(),
                            Field.Store.YES));
                }

                String lang = status.getLang();
                if (!lang.equals("unknown")) {
                    doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
                }

                long retweetStatusId = status.getRetweetedStatusId();
                if (retweetStatusId > 0) {
                    doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(),
                            Field.Store.YES));
                    doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
                    if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                        LOG.warn("Error parsing retweet fields of " + status.getId());
                    }
                }

                long id = status.getId();
                BytesRefBuilder brb = new BytesRefBuilder();
                NumericUtils.longToPrefixCodedBytes(id, 0, brb);
                Term term = new Term(StatusField.ID.name, brb.get());
                writer.updateDocument(term, doc);

                updateCount += 1;

                if (updateCount % 10000 == 0) {

                    LOG.info(updateCount + " statuses updated");
                    writer.commit();
                    System.out.println("Updated docCount=" + writer.numDocs());
                }

            }

        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}

From source file:io.anserini.index.IndexWebCollection.java

License:Apache License
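
Feeds WARC files to a thread pool in batches, refilling the queue as earlier tasks complete, and commits once at the end, before an optional forceMerge(1).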

public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    LOG.info("Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(new EnglishAnalyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(512);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
    final String suffix = Collection.GOV2.equals(collection) ? ".gz" : ".warc.gz";
    final Deque<Path> warcFiles = discoverWarcFiles(docDir, suffix);

    // Respect the document limit by trimming the work queue.
    if (doclimit > 0)
        while (warcFiles.size() > doclimit)
            warcFiles.removeFirst();

    long totalWarcFiles = warcFiles.size();
    LOG.info(totalWarcFiles + " many " + suffix + " files found under the docs path : " + docDir.toString());

    // Submit an initial batch of up to 2000 indexing tasks; remaining files
    // are fed to the executor as earlier tasks complete.
    for (int i = 0; i < 2000; i++) {
        if (!warcFiles.isEmpty())
            executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
        else {
            if (!executor.isShutdown()) {
                Thread.sleep(30000);
                executor.shutdown();
            }
            break;
        }
    }

    long first = 0;
    // Give the scheduler time to spawn worker threads.
    Thread.sleep(30000);

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {

            final long completedTaskCount = executor.getCompletedTaskCount();

            LOG.info(String.format("%.2f percentage completed",
                    (double) completedTaskCount / totalWarcFiles * 100.0d));

            if (!warcFiles.isEmpty())
                for (long i = first; i < completedTaskCount; i++) {
                    if (!warcFiles.isEmpty())
                        executor.execute(new IndexerThread(writer, warcFiles.removeFirst()));
                    else {
                        if (!executor.isShutdown())
                            executor.shutdown();
                    }
                }

            first = completedTaskCount;
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    if (totalWarcFiles != executor.getCompletedTaskCount())
        throw new RuntimeException("totalWarcFiles = " + totalWarcFiles
                + " is not equal to completedTaskCount =  " + executor.getCompletedTaskCount());

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
        if (optimize)
            writer.forceMerge(1);
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file:io.anserini.index.UpdateIndex.java

License:Apache License
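
Augments an existing tweet index with per-user timeline documents, calling commit() after every document it adds.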

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(UpdateIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);

    LOG.info("index: " + indexPath);

    File file = new File("PittsburghUserTimeline");
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    final StatusStream stream = new JsonStatusCorpusReader(file);

    Status status;
    String s;
    HashMap<Long, String> hm = new HashMap<Long, String>();
    try {
        while ((s = stream.nextRaw()) != null) {
            try {
                status = DataObjectFactory.createStatus(s);

                if (status.getText() == null) {
                    continue;
                }

                long userId = status.getUser().getId();
                String text = status.getText().replaceAll("[\\r\\n]+", " ");
                // Append to any text already collected for this user; a bare
                // hm.get(...) + text would prepend the string "null" on first use.
                String previous = hm.get(userId);
                hm.put(userId, previous == null ? text : previous + text);

            } catch (Exception e) {
                // Skip statuses that fail to parse.
            }
        }

    } catch (Exception e) {
        e.printStackTrace();
    } finally {

        stream.close();
    }

    ArrayList<String> userIDList = new ArrayList<String>();
    try (BufferedReader br = new BufferedReader(new FileReader(new File("userID")))) {
        String line;
        while ((line = br.readLine()) != null) {
            userIDList.add(line.replaceAll("[\\r\\n]+", ""));

        }
    }

    try {
        reader = DirectoryReader
                .open(FSDirectory.open(new File(cmdline.getOptionValue(INDEX_OPTION)).toPath()));
    } catch (IOException e) {
        // Little to do if the existing index cannot be opened; report it.
        e.printStackTrace();
    }

    final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION)));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    final IndexWriter writer = new IndexWriter(dir, config);

    IndexSearcher searcher = new IndexSearcher(reader);
    System.out.println("The total number of docs indexed "
            + searcher.collectionStatistics(TweetStreamReader.StatusField.TEXT.name).docCount());

    for (int city = 0; city < cityName.length; city++) {

        // Pittsburgh's coordinate -79.976389, 40.439722

        Query q_long = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LONGITUDE.name,
                longitude[city] - 0.05, longitude[city] + 0.05, true, true);
        Query q_lat = NumericRangeQuery.newDoubleRange(TweetStreamReader.StatusField.LATITUDE.name,
                latitude[city] - 0.05, latitude[city] + 0.05, true, true);

        BooleanQuery bqCityName = new BooleanQuery();

        Term t = new Term("place", cityName[city]);
        TermQuery query = new TermQuery(t);
        bqCityName.add(query, BooleanClause.Occur.SHOULD);
        System.out.println(query.toString());

        for (int i = 0; i < cityNameAlias[city].length; i++) {
            t = new Term("place", cityNameAlias[city][i]);
            query = new TermQuery(t);
            bqCityName.add(query, BooleanClause.Occur.SHOULD);
            System.out.println(query.toString());
        }

        BooleanQuery bq = new BooleanQuery();

        BooleanQuery finalQuery = new BooleanQuery();

        // either a coordinate match
        bq.add(q_long, BooleanClause.Occur.MUST);
        bq.add(q_lat, BooleanClause.Occur.MUST);

        finalQuery.add(bq, BooleanClause.Occur.SHOULD);
        // or a place city name match
        finalQuery.add(bqCityName, BooleanClause.Occur.SHOULD);

        TotalHitCountCollector totalHitCollector = new TotalHitCountCollector();

        searcher.search(finalQuery, totalHitCollector);

        if (totalHitCollector.getTotalHits() > 0) {
            TopScoreDocCollector collector = TopScoreDocCollector
                    .create(Math.max(0, totalHitCollector.getTotalHits()));
            searcher.search(finalQuery, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits.");

            HashMap<String, Integer> hasHit = new HashMap<String, Integer>();
            int dupcount = 0;
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d;

                d = searcher.doc(docId);

                if (userIDList.contains(d.get(IndexTweets.StatusField.USER_ID.name))
                        && hm.containsKey(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name)))) {
                    //            System.out.println("Has timeline field?" + (d.get("timeline") != null));
                    //            System.out.println(reader.getDocCount("timeline"));
                    //            d.add(new Field("timeline", hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))),
                    //                textOptions));
                    System.out.println("Found a user hit");
                    BytesRefBuilder brb = new BytesRefBuilder();
                    NumericUtils.longToPrefixCodedBytes(Long.parseLong(d.get(IndexTweets.StatusField.ID.name)),
                            0, brb);
                    Term term = new Term(IndexTweets.StatusField.ID.name, brb.get());
                    //            System.out.println(reader.getDocCount("timeline"));

                    Document d_new = new Document();
                    //            for (IndexableField field : d.getFields()) {
                    //              d_new.add(field);
                    //            }
                    // System.out.println(d_new.getFields());
                    d_new.add(new StringField("userBackground", d.get(IndexTweets.StatusField.USER_ID.name),
                            Store.YES));
                    d_new.add(new Field("timeline",
                            hm.get(Long.parseLong(d.get(IndexTweets.StatusField.USER_ID.name))), textOptions));
                    // System.out.println(d_new.get());
                    writer.addDocument(d_new);
                    writer.commit();

                    //            t = new Term("label", "why");
                    //            TermQuery tqnew = new TermQuery(t);
                    //
                    //            totalHitCollector = new TotalHitCountCollector();
                    //
                    //            searcher.search(tqnew, totalHitCollector);
                    //
                    //            if (totalHitCollector.getTotalHits() > 0) {
                    //              collector = TopScoreDocCollector.create(Math.max(0, totalHitCollector.getTotalHits()));
                    //              searcher.search(tqnew, collector);
                    //              hits = collector.topDocs().scoreDocs;
                    //
                    //              System.out.println("City " + cityName[city] + " " + collector.getTotalHits() + " hits.");
                    //
                    //              for (int k = 0; k < hits.length; k++) {
                    //                docId = hits[k].doc;
                    //                d = searcher.doc(docId);
                    //                System.out.println(d.get(IndexTweets.StatusField.ID.name));
                    //                System.out.println(d.get(IndexTweets.StatusField.PLACE.name));
                    //              }
                    //            }

                    // writer.deleteDocuments(term);
                    // writer.commit();
                    // writer.addDocument(d);
                    // writer.commit();

                    //            System.out.println(reader.getDocCount("timeline"));
                    // writer.updateDocument(term, d);
                    // writer.commit();

                }

            }
        }
    }
    reader.close();
    writer.close();

}

From source file:io.anserini.IndexerCW09B.java

License:Apache License
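
A simpler threaded indexer for ClueWeb09 category B: all WARC files are submitted up front, and a single commit() precedes close().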

public int indexWithThreads(int numThreads) throws IOException, InterruptedException {

    System.out.println(
            "Indexing with " + numThreads + " threads to directory '" + indexPath.toAbsolutePath() + "'...");

    final Directory dir = FSDirectory.open(indexPath);

    final IndexWriterConfig iwc = new IndexWriterConfig(analyzer());

    iwc.setSimilarity(new BM25Similarity());
    iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setUseCompoundFile(false);
    iwc.setMergeScheduler(new ConcurrentMergeScheduler());

    final IndexWriter writer = new IndexWriter(dir, iwc);

    final ExecutorService executor = Executors.newFixedThreadPool(numThreads);

    for (Path f : discoverWarcFiles(docDir))
        executor.execute(new IndexerThread(writer, f));

    // Give the scheduler time to spawn worker threads.
    Thread.sleep(30000);
    executor.shutdown(); // Disable new tasks from being submitted

    try {
        // Wait for existing tasks to terminate
        while (!executor.awaitTermination(5, TimeUnit.MINUTES)) {
            Thread.sleep(1000);
        }
    } catch (InterruptedException ie) {
        // (Re-)Cancel if current thread also interrupted
        executor.shutdownNow();
        // Preserve interrupt status
        Thread.currentThread().interrupt();
    }

    int numIndexed = writer.maxDoc();

    try {
        writer.commit();
    } finally {
        writer.close();
    }

    return numIndexed;
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License
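
Builds a three-document test index; a single commit() followed by forceMerge(1) precedes close().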

private void buildTestIndex() throws IOException {
    Directory dir = FSDirectory.open(tempDir1);

    Analyzer analyzer = new EnglishAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);

    FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    textOptions.setStoreTermVectors(true);
    textOptions.setStoreTermVectorPositions(true);

    Document doc1 = new Document();
    doc1.add(new StringField("docid", "doc1", Field.Store.YES));
    doc1.add(new Field("text", "here is some text here is some more text", textOptions));
    writer.addDocument(doc1);

    Document doc2 = new Document();
    doc2.add(new StringField("docid", "doc2", Field.Store.YES));
    doc2.add(new Field("text", "more text", textOptions));
    writer.addDocument(doc2);

    Document doc3 = new Document();
    doc3.add(new StringField("docid", "doc3", Field.Store.YES));
    doc3.add(new Field("text", "here is a test", textOptions));
    writer.addDocument(doc3);

    writer.commit();
    writer.forceMerge(1);
    writer.close();
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License
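
Clones an index by wrapping a CodecReader with addIndexes(...), then commits and force-merges before verifying the copy through a fresh reader.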

@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}

From source file:io.crate.execution.engine.collect.collectors.LuceneBatchIteratorBenchmark.java

License:Apache License
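
Benchmark setup that indexes ten million documents with a numeric doc-values field, commits, force-merges to a single segment, and opens a searcher directly over the writer.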

@Setup
public void createLuceneBatchIterator() throws Exception {
    IndexWriter iw = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(new StandardAnalyzer()));
    String columnName = "x";
    for (int i = 0; i < 10_000_000; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField(columnName, i));
        iw.addDocument(doc);
    }
    iw.commit();
    iw.forceMerge(1, true);
    indexSearcher = new IndexSearcher(DirectoryReader.open(iw));
    IntegerColumnReference columnReference = new IntegerColumnReference(columnName);
    columnRefs = Collections.singletonList(columnReference);

    collectorContext = new CollectorContext(mappedFieldType -> null, new CollectorFieldsVisitor(0));
}

From source file:io.crate.execution.engine.collect.collectors.LuceneBatchIteratorTest.java

License:Apache License
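
Test fixture that indexes twenty documents with a long doc-values column and commits before opening a searcher over the writer.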

@Before
public void prepareSearcher() throws Exception {
    IndexWriter iw = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(new StandardAnalyzer()));
    String columnName = "x";
    expectedResult = new ArrayList<>(20);
    for (long i = 0; i < 20; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField(columnName, i));
        iw.addDocument(doc);
        expectedResult.add(new Object[] { i });
    }
    iw.commit();
    indexSearcher = new IndexSearcher(DirectoryReader.open(iw));
    LongColumnReference columnReference = new LongColumnReference(columnName);
    columnRefs = Collections.singletonList(columnReference);
}