Usage examples for the org.apache.lucene.index.IndexWriter constructor:

public IndexWriter(Directory d, IndexWriterConfig conf) throws IOException

Constructs a new IndexWriter per the settings given in conf. The index is either created in, or appended to, the directory d according to conf.getOpenMode(); conf holds the analyzer and all other configuration settings with which the writer is initialized.
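Before the project-specific examples below, here is a minimal sketch of the constructor's basic contract: open a Directory, build an IndexWriterConfig around an analyzer, and pass both to the constructor. The index path /tmp/example-index, the field name "text", and the choice of Version.LUCENE_47 are illustrative assumptions, not taken from any example on this page.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class MinimalIndexWriterExample {
    public static void main(String[] args) throws IOException {
        // Open (or create) an on-disk index directory; the path is an illustrative assumption.
        Directory dir = FSDirectory.open(new File("/tmp/example-index"));

        // The config carries the analyzer, the open mode, and every other writer setting.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
        IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);

        IndexWriter writer = new IndexWriter(dir, conf);
        try {
            Document doc = new Document();
            doc.add(new TextField("text", "hello lucene", Field.Store.YES));
            writer.addDocument(doc);
        } finally {
            // Closing the writer commits buffered documents and releases the write lock.
            writer.close();
            dir.close();
        }
    }
}

Closing the writer commits any buffered documents; a writer that is never closed (or committed) is the most common reason an index appears empty, which is why every example below closes its writer.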
From source file:ca.ualberta.entitylinking.common.indexing.WikipediaIndex.java
License:Open Source License
public void initWriter(String indexDir, boolean create) {
    try {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, new StandardAnalyzer(Version.LUCENE_34));

        // create a new index
        if (create)
            iwc.setOpenMode(OpenMode.CREATE);
        else
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

        writer = new IndexWriter(dir, iwc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:calliope.search.AeseSearch.java
License:Open Source License
/**
 * Update the index for just ONE docID
 * @param docID the document to regenerate
 * @param langCode the language code of the analyzer
 * @throws AeseException
 */
public static void updateIndex(String docID, String langCode) throws AeseException {
    try {
        Analyzer analyzer = createAnalyzer(langCode);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer);
        if (index == null)
            throw new AeseException("Index must be initialised before update");
        IndexWriter w = new IndexWriter(index, config);
        Term t = new Term(LuceneFields.DOCID, docID);
        w.deleteDocuments(t);
        addCorTextoIndex(docID, w);
        w.close();
    } catch (Exception e) {
        throw new AeseException(e);
    }
}
From source file:calliope.search.AeseSearch.java
License:Open Source License
/**
 * Build the entire Lucene index from scratch
 * @param langCode the language code
 */
public static void buildIndex(String langCode) throws AeseException {
    try {
        String[] docIDs = Connector.getConnection().listCollection(Database.CORTEX);
        Analyzer analyzer = createAnalyzer(langCode);
        File home = new File(System.getProperty("user.home"));
        indexLocation = new File(home, ".calliope");
        if (!indexLocation.exists())
            indexLocation.mkdir();
        AeseSearch.index = new NIOFSDirectory(indexLocation);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer);
        IndexWriter w = new IndexWriter(index, config);
        for (int i = 0; i < docIDs.length; i++) {
            addCorTextoIndex(docIDs[i], w);
        }
        w.close();
    } catch (Exception e) {
        throw new AeseException(e);
    }
}
From source file:cc.pp.analyzer.ik.demo.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // Lucene Document field name
    String fieldName = "text";
    // sample Chinese text to index (the stock IKAnalyzer demo sentence; the original literal was mangled by encoding)
    String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";

    // instantiate the IKAnalyzer tokenizer in smart mode
    Analyzer analyzer = new IKAnalyzer(Version.LUCENE_48, true);

    Directory directory = null;
    IndexWriter iwriter = null;
    DirectoryReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // build an in-memory index
        directory = new RAMDirectory();

        // configure the IndexWriterConfig
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);

        // write a document to the index
        Document doc = new Document();
        doc.add(new LongField("ID", 1000, Field.Store.YES));
        doc.add(new TextField(fieldName, text, Field.Store.YES));
        iwriter.addDocument(doc);
        iwriter.close();

        // ********************************** search phase
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "中文分词";
        // build a Query with the QueryParser
        QueryParser qp = new QueryParser(Version.LUCENE_48, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // fetch the 5 highest-scoring hits
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("hits: " + topDocs.totalHits);

        // print the results
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < topDocs.totalHits; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println("doc: " + targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:cc.twittertools.index.IndexStatuses.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(/* ww w.j a va2 s.c om*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexStatuses.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexed(true); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file:cc.wikitools.lucene.IndexWikipediaDump.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));// w ww . j a v a 2 s .co m options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build(); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); try { WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String page; while ((page = stream.readNext()) != null) { String title = cleaner.getTitle(page); // These are heuristic specifically for filtering out non-articles in enwiki-20120104. if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) { continue; } if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) { continue; } Runnable worker = new AddDocumentRunnable(writer, cleaner, page); executor.execute(worker); cnt++; if (cnt % 10000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); } }
From source file:ch.admin.isb.hermes5.business.search.SearchEngine.java
License:Apache License
public IndexWriterWrapper startIndexing(String modelIdentifier, LocalizationEngine localizationEngine) {
    try {
        String language = localizationEngine.getLanguage();
        Analyzer analyzer = analyserRepository.getAnalyzer(language);
        String path = searchIndexManager.getSearchIndexPath(modelIdentifier, language);
        searchIndexManager.cleanOldIndices(modelIdentifier, language);
        Directory directory = FSDirectory.open(new File(path));
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        IndexWriter iwriter = new IndexWriter(directory, config);
        return new IndexWriterWrapper(iwriter, localizationEngine);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:ch.algotrader.rest.index.SecurityIndexer.java
License:Open Source License
private void buildIndex(Collection<Security> securities) {
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    try (IndexWriter iwriter = new IndexWriter(index, config)) {
        Collection<Collection<IndexableField>> securityDocuments = securities.stream()
                .map(Security::convertToVO).map(this::createDocument).collect(Collectors.toList());
        iwriter.addDocuments(securityDocuments);
    } catch (IOException ex) {
        throw new UnrecoverableCoreException("Unexpected I/O error building security index", ex);
    }
}
From source file:ch.ksfx.web.services.lucene.AsynchronousIndexer.java
License:Open Source License
public void run() {
    while (true) {
        try {
            if (writer == null || !writer.isOpen()) {
                IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
                writer = new IndexWriter(dir, iwc);
            }

            IndexEvent indexEvent = queuedIndexEvents.take();
            String sortableDateTime = null;

            if (indexEvent.getDeleteEvent()) {
                if (indexEvent instanceof DeleteSeriesObservationsEvent) {
                    writer.deleteDocuments(new Term("series_id", indexEvent.getSeriesId().toString()));
                } else {
                    sortableDateTime = DateFormatUtil.formatToLexicographicallySortableTimeAndDateString(
                            indexEvent.getObservationTime());
                    writer.deleteDocuments(new Term("internal_id",
                            indexEvent.getSeriesId().toString() + sortableDateTime + indexEvent.getSourceId()));
                }
                if (queuedIndexEvents.size() == 0) {
                    writer.close();
                }
                continue;
            }

            Observation obs = observationDAO.getObservationForTimeSeriesIdObservationTimeAndSourceId(
                    indexEvent.getSeriesId(), indexEvent.getObservationTime(), indexEvent.getSourceId());

            Document doc = new Document();
            sortableDateTime = DateFormatUtil
                    .formatToLexicographicallySortableTimeAndDateString(obs.getObservationTime());
            String isoDateTime = DateFormatUtil.formatToISO8601TimeAndDateString(obs.getObservationTime());

            doc.add(new StringField("internal_id",
                    obs.getTimeSeriesId().toString() + sortableDateTime + obs.getSourceId(), Field.Store.NO));
            doc.add(new StringField("series_id", obs.getTimeSeriesId().toString(), Field.Store.YES));
            doc.add(new StringField("observation_time", isoDateTime, Field.Store.YES));
            doc.add(new StringField("sortable_observation_time", sortableDateTime, Field.Store.NO));
            doc.add(new StringField("source_id", obs.getSourceId(), Field.Store.YES));

            addField(doc, "source_uri", obs.getSourceId());
            addField(doc, "scalar_value", obs.getScalarValue());

            for (String key : obs.getComplexValue().keySet()) {
                addField(doc, key, obs.getComplexValue().get(key));
            }

            //System.out.println("Meta data to index: " + obs.getMetaData() + " Size: " + obs.getMetaData().size());
            for (String key : obs.getMetaData().keySet()) {
                System.out.println("Indexing meta data: " + key + " --> " + obs.getMetaData().get(key));
                addField(doc, key, obs.getMetaData().get(key));
            }

            writer.updateDocument(new Term("internal_id",
                    obs.getTimeSeriesId().toString() + sortableDateTime + obs.getSourceId()), doc);

            if (queuedIndexEvents.size() == 0) {
                writer.close();
            }
        } catch (Exception e) {
            logger.error("Error while Asynchronous Indexing", e);
        }
    }
}
From source file:choco.lucene.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // Lucene Document field name
    String fieldName = "text";
    // sample Chinese text to index (the stock IKAnalyzer demo sentence; the original literal was mangled by encoding)
    String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";

    // instantiate the IKAnalyzer tokenizer
    Analyzer analyzer = new IKAnalyzer();

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // build an in-memory index
        directory = new RAMDirectory();

        // configure the IndexWriterConfig
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_34, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);

        // write a document to the index
        Document doc = new Document();
        doc.add(new Field("ID", "10000", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));
        iwriter.addDocument(doc);
        iwriter.close();

        // ********************************** search phase
        ireader = IndexReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "中文分词";
        // build a Query with the QueryParser
        QueryParser qp = new QueryParser(Version.LUCENE_34, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);

        // fetch the 5 highest-scoring hits
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("hits: " + topDocs.totalHits);

        // print the results
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < topDocs.totalHits; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println("doc: " + targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}