Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.

Usage

From source file:aos.lucene.tools.BerkeleyDbJEIndexer.java

License:Apache License

/**
 * Builds a tiny Lucene index whose files live inside a Berkeley DB JE
 * environment (via JEDirectory) instead of an ordinary filesystem directory.
 *
 * Usage: BerkeleyDbIndexer &lt;index dir&gt;
 *
 * @param args single argument: directory used for the JE environment
 * @throws IOException       on Lucene index write failure
 * @throws DatabaseException on Berkeley DB environment/database failure
 */
public static void main(String[] args) throws IOException, DatabaseException {
    if (args.length != 1) {
        System.err.println("Usage: BerkeleyDbIndexer <index dir>");
        System.exit(-1);
    }

    File indexFile = new File(args[0]);

    // Wipe any previous run: remove the JE database files (named "__*"),
    // then the directory itself.
    // NOTE(review): File.delete() fails silently when the directory still
    // contains other files — confirm that is acceptable here.
    if (indexFile.exists()) {
        File[] files = indexFile.listFiles();
        for (int i = 0; i < files.length; i++)

            if (files[i].getName().startsWith("__"))
                files[i].delete();
        indexFile.delete();
    }

    indexFile.mkdir();

    // Environment and databases are transactional so the index contents
    // are committed atomically below.
    EnvironmentConfig envConfig = new EnvironmentConfig();
    DatabaseConfig dbConfig = new DatabaseConfig();

    envConfig.setTransactional(true);
    envConfig.setAllowCreate(true);
    dbConfig.setTransactional(true);
    dbConfig.setAllowCreate(true);

    Environment env = new Environment(indexFile, envConfig);

    // First transaction only creates the two databases backing JEDirectory.
    Transaction txn = env.beginTransaction(null, null);
    Database index = env.openDatabase(txn, "__index__", dbConfig);
    Database blocks = env.openDatabase(txn, "__blocks__", dbConfig);
    txn.commit();
    txn = env.beginTransaction(null, null);

    // All index writes below happen inside this second transaction.
    JEDirectory directory = new JEDirectory(txn, index, blocks);

    IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_46), true,
            IndexWriter.MaxFieldLength.UNLIMITED);

    Document doc = new Document();
    doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);

    writer.merge(writer.getNextMerge());
    writer.close();

    // Close the directory before committing so buffered blocks are flushed
    // into the transaction; then release databases and environment in order.
    directory.close();
    txn.commit();

    index.close();
    blocks.close();
    env.close();

    LOGGER.info("Indexing Complete");
}

From source file:aos.lucene.tools.ChainedFilterTest.java

License:Apache License

/**
 * Builds the shared test fixture: an in-memory index of MAX documents — the
 * first half owned by "bob", the rest by "sue", one per consecutive day
 * starting 2009-02-01 — plus the owner query and the date/owner filters the
 * test cases combine.
 */
@Override
public void setUp() throws Exception {

    directory = new RAMDirectory();

    IndexWriter indexWriter = new IndexWriter(directory, new WhitespaceAnalyzer(Version.LUCENE_46),
            IndexWriter.MaxFieldLength.UNLIMITED);

    Calendar calendar = Calendar.getInstance();
    calendar.set(2009, 1, 1, 0, 0);

    for (int docNum = 0; docNum < MAX; docNum++) {
        // First half of the documents belong to bob, the rest to sue.
        String owner = (docNum < MAX / 2) ? "bob" : "sue";
        String day = DateTools.timeToString(calendar.getTimeInMillis(), DateTools.Resolution.DAY);

        Document doc = new Document();
        doc.add(new Field("key", "" + (docNum + 1), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("owner", owner, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("date", day, Field.Store.YES, Field.Index.NOT_ANALYZED));
        indexWriter.addDocument(doc);

        calendar.add(Calendar.DATE, 1);
    }

    indexWriter.close();

    searcher = new IndexSearcher(directory);

    // Query matching every document: owner is either bob or sue.
    BooleanQuery ownerQuery = new BooleanQuery();
    ownerQuery.add(new TermQuery(new Term("owner", "bob")), BooleanClause.Occur.SHOULD);
    ownerQuery.add(new TermQuery(new Term("owner", "sue")), BooleanClause.Occur.SHOULD);
    query = ownerQuery;

    // Date filter accepting everything indexed above (anything before 2099-02-01).
    calendar.set(2099, 1, 1, 0, 0);
    dateFilter = TermRangeFilter.Less("date",
            DateTools.timeToString(calendar.getTimeInMillis(), DateTools.Resolution.DAY));

    bobFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("owner", "bob"))));

    sueFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("owner", "sue"))));
}

From source file:aos.lucene.tools.FastVectorHighlighterSample.java

License:Apache License

/**
 * Rebuilds the sample index from scratch: one analyzed document per entry in
 * DOCS, storing term vectors with positions and offsets so the
 * FastVectorHighlighter can mark up matches.
 */
static void makeIndex() throws IOException {
    IndexWriter indexWriter = new IndexWriter(dir, analyzer, true, MaxFieldLength.UNLIMITED);
    for (String text : DOCS) {
        Document document = new Document();
        document.add(new Field(F, text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        indexWriter.addDocument(document);
    }
    indexWriter.close();
}

From source file:aos.lucene.tools.SpatialLuceneExample.java

License:Apache License

/**
 * Indexes one named location: the raw name, prefix-coded latitude/longitude,
 * a "metafile" marker field, and one cartesian-tier box-id field per tier
 * level 5..15 so the document can be found by spatial queries.
 *
 * @param writer index writer that receives the document
 * @param name   human-readable location name
 * @param lat    latitude in degrees
 * @param lng    longitude in degrees
 * @throws IOException if the document cannot be added to the index
 */
private void addLocation(IndexWriter writer, String name, double lat, double lng) throws IOException {

    Document doc = new Document();
    doc.add(new Field("name", name, Field.Store.YES, Field.Index.ANALYZED));

    // Coordinates are prefix-coded so they order correctly as strings.
    doc.add(new Field(latField, NumericUtils.doubleToPrefixCoded(lat),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field(lngField, NumericUtils.doubleToPrefixCoded(lng),
            Field.Store.YES, Field.Index.NOT_ANALYZED));

    doc.add(new Field("metafile", "doc", Field.Store.YES, Field.Index.ANALYZED));

    IProjector projector = new SinusoidalProjector();

    final int startTier = 5;
    final int endTier = 15;

    // One box-id field per tier granularity, from coarse (5) to fine (15).
    for (int tier = startTier; tier <= endTier; tier++) {
        CartesianTierPlotter plotter = new CartesianTierPlotter(tier, projector, tierPrefix);

        double boxId = plotter.getTierBoxId(lat, lng);
        LOGGER.info("Adding field " + plotter.getTierFieldName() + ":" + boxId);
        doc.add(new Field(plotter.getTierFieldName(), NumericUtils
                .doubleToPrefixCoded(boxId), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

    }

    writer.addDocument(doc);
    LOGGER.info("===== Added Doc to index ====");
}

From source file:api.startup.PDFIndexer.java

License:Open Source License

/**
 * Indexes a single PDF document and writes it to the given index writer:
 * path, author, title, conference and last-modified metadata alongside the
 * extracted full text (stored with complete term-vector information for
 * highlighting). In CREATE mode the document is added; otherwise any
 * previously indexed copy with the same path is replaced.
 *
 * @param writer   the index writer to write to
 * @param metadata the document metadata, including the PDF's filename
 * @throws IOException declared for signature compatibility; read failures
 *                     are caught and logged rather than propagated
 */
static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException {
    Path file = Paths.get(metadata.getFilename());
    try {
        Document doc = new Document();

        Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add Document metadata //
        doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES));
        // End of Document Metadata //

        Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(),
                Field.Store.YES);
        doc.add(modified);

        PDFTextExtractor extractor = new PDFTextExtractor();
        // Get the string contents
        String textContents = extractor.extractText(file.toString());

        // Store the contents with full term-vector information (positions,
        // payloads, offsets) so results can be highlighted later.
        FieldType contentsType = new FieldType();
        contentsType.setStored(true);
        contentsType.setTokenized(true);
        contentsType.setStoreTermVectors(true);
        contentsType.setStoreTermVectorPositions(true);
        contentsType.setStoreTermVectorPayloads(true);
        contentsType.setStoreTermVectorOffsets(true);
        contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType);
        doc.add(contents);

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            log.info("adding " + file + " to index");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            log.info("updating " + file + " in index");
            writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc);
        }
    } catch (IOException e) {
        // Pass the exception to the logger so the stack trace and root cause
        // are not lost (previously only the filename was reported).
        log.error("Failed to read file " + metadata.getFilename(), e);
    }

}

From source file:aplicacion.sistema.indexer.test.IndexFiles.java

License:Apache License

/**
 * Recursively indexes a file or a whole directory tree. Unreadable entries
 * are skipped, and files that vanish between listing and indexing (common
 * with temporary files on Windows) are reported and skipped instead of
 * aborting the walk.
 *
 * @param writer destination index writer
 * @param file   file or directory to index
 * @throws IOException if adding a readable document to the index fails
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            System.out.println("adding " + file);
            try {
                writer.addDocument(FileDocument.Document(file));
            }
            // at least on windows, some temporary files raise this exception with an "access denied" message
            // checking if the file can be read doesn't help; continue with the
            // remaining files, but leave a trace instead of failing silently
            catch (FileNotFoundException fnfe) {
                System.out.println("skipping " + file + ": " + fnfe.getMessage());
            }
        }
    }
}

From source file:Application.mediaIndexer.java

/**
 * Indexes a single media file: all Tika-extracted metadata fields, the file
 * path, and the last-modified time. Adds the document when the writer is in
 * CREATE mode, otherwise replaces any previously indexed document with the
 * same path.
 *
 * @param writer       destination index writer
 * @param file         media file to index
 * @param results      UI text area that receives progress messages
 * @param lastModified file modification time in epoch milliseconds
 * @throws IOException   if the file cannot be read or written to the index
 * @throws SAXException  if Tika's SAX content handling fails
 * @throws TikaException if Tika cannot parse the media file
 */
public static void indexDoc(IndexWriter writer, Path file, TextArea results, long lastModified)
        throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = Files.newInputStream(file)) {
        parser.parse(stream, handler, metadata);
        Document doc = new Document();
        // Every metadata field Tika found becomes a stored, analyzed field.
        String[] metadataNames = metadata.names();
        for (String name : metadataNames)
            doc.add(new TextField(name, metadata.get(name), Field.Store.YES));
        doc.add(new StringField("path", file.toString(), Field.Store.YES));
        doc.add(new LongPoint("modified", lastModified));
        results.appendText("Title: " + metadata.get("title") + "\n");
        results.appendText("Artists: " + metadata.get("xmpDM:artist") + "\n");
        results.appendText("Genre: " + metadata.get("xmpDM:genre") + "\n");
        results.appendText("Year: " + metadata.get("xmpDM:releaseDate") + "\n");
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can
            // be there):
            results.appendText("adding " + file + "\n");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been
            // indexed):
            // fix: trailing newline was missing, unlike every other message
            results.appendText("updating " + file + "\n");
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:application.ReviewDocumentIndexer.java

License:Open Source License

/**
 * @param args/*ww  w  . jav  a  2  s  .c om*/
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) {
    // Parse command line arguments. Exit program is provided arguments are insufficient
    ReviewDocumentIndexer indexer = new ReviewDocumentIndexer(args);
    if (indexer == null)
        return;

    // Open a new index
    IndexWriter index = null;
    try {
        index = new IndexWriter(new SimpleFSDirectory(new File(Paths.luceneIndex)),
                new ReviewTextAnalyzer(indexer), indexer.new_index ? true : false, MaxFieldLength.UNLIMITED);
        if (indexer.pause_every > 2) {
            index.setMaxBufferedDocs(indexer.pause_every);
        }
        index.setMaxMergeDocs(Config.maxMergeDocs);
        index.setMergeFactor(Config.mergeFactor);
    } catch (CorruptIndexException e) {
        AppLogger.error.log(Level.SEVERE,
                "Lucene detected an inconsistency upon opening the index located at " + Paths.luceneIndex);
        throw new RuntimeException("Exiting application", e);
    } catch (LockObtainFailedException e) {
        AppLogger.error.log(Level.SEVERE,
                "Index located at " + Paths.luceneIndex + " is already open by another Lucene process");
        throw new RuntimeException("Exiting application", e);
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE, "Could not access location " + Paths.luceneIndex);
        throw new RuntimeException("Exiting application", e);
    }

    // Load a number of reviews from database
    NumberFormat docIdFormat = TokenListsCollector.defaultDocIdFormat();
    try {
        DatabaseReviewCollection reviews = new DatabaseReviewCollection(indexer.pause_every);
        reviews.setLimits(indexer.min_reviewid, indexer.stop_after);
        int indexed_counter = 0;

        while (reviews.hasNextSegment()) {

            System.out.print(Calendar.getInstance().getTime().toGMTString());

            System.out.print(" Loading from DB... ");
            reviews.loadNextSegment();
            Iterator<Review> reviewsIterator = reviews.getIterator();

            System.out.print(" Indexing... ");
            while (reviewsIterator.hasNext()) {
                DatabaseReview dbr = (DatabaseReview) reviewsIterator.next();
                int dbr_id = dbr.getReviewid();
                int dbr_rating = dbr.getRating();

                try {
                    indexer.theReviewId.set(dbr_id);
                    indexer.theStats.setCurrent(dbr_id, dbr_rating);

                    index.addDocument(dbr.getDocumentForIndexing());
                    indexed_counter++;

                    // Also, keep track of the rating and length of this review
                    indexer.theStats.storeCurrent();

                } catch (CorruptIndexException e) {
                    AppLogger.error.log(Level.SEVERE,
                            "Lucene detected an inconsistency upon saving review #"
                                    + Integer.toString(dbr.getReviewid()) + "to the index located at "
                                    + Paths.luceneIndex);
                    return;
                } catch (IOException e) {
                    AppLogger.error.log(Level.WARNING,
                            "Review #" + Integer.toString(dbr.getReviewid()) + " could not be indexed");
                }
            }

            // Backup everything
            System.out.print("Indexed " + indexed_counter + " reviews total. ");
            if (indexer.pause_every > 0) {
                System.out.print("Saving tokenlists... ");
                indexer.theTokenLists.writeNextFile(docIdFormat);

                System.out.print("Saving state... ");
                try {
                    index.commit();
                    indexer.saveState();
                } catch (CorruptIndexException e) {
                    AppLogger.error.log(Level.SEVERE, "Committing index changes failed on review #"
                            + indexer.theReviewId.get() + "due to CorruptIndexException");
                    return;
                } catch (IOException e) {
                    AppLogger.error.log(Level.WARNING, "Committing index changes failed on review #"
                            + indexer.theReviewId.get() + "due to IOException");
                }
            }
            System.out.print("DONE\n");

            reviews.reset();
        }
    } catch (SQLException e) {
        AppLogger.error.log(Level.SEVERE,
                "An exception occured while trying to access the database.\n" + e.getMessage());
        return;
    }

    try {
        index.close();
        indexer.backupIndex();
    } catch (CorruptIndexException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    System.err.println("Indexing successfully completed!");
    return;
}

From source file:apps.LuceneIndexer.java

License:Apache License

/**
 * Command-line indexer: reads documents from a pluggable DocumentSource,
 * cleans their text, and writes them into a Lucene index configured with a
 * (fixed or stock) BM25 similarity. Optionally emits a TREC-format QREL
 * file recording each document's relevance grade.
 *
 * @param args see the Options built below (-i input, -o output dir,
 *             -r qrel file, BM25 parameters, -source_type)
 */
public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    IndexWriter indexWriter = null;
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        // Usage(...) reports the problem; required options fall through here.
        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        if (qrelFileName != null)
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));

        // Validate the output directory before opening the index in it.
        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
            OpenMode.CREATE creates a new index or overwrites an existing one.
            https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
        */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        // Index every document the source yields: an ID field plus the
        // cleaned, re-joined full text.
        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            // Mirror the relevance judgement into the QREL file, if requested.
            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            // Progress report every 1000 documents.
            if (docNum % 1000 == 0)
                System.out.println(String.format("Indexed %d documents", docNum));

        }

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        // Always close both writers, even on the failure paths above.
        try {
            if (null != indexWriter)
                indexWriter.close();
            if (null != qrelWriter)
                qrelWriter.close();
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}

From source file:arena.lucene.LuceneIndexUpdater.java

License:Open Source License

/**
 * Re-indexes the given value objects: for each one, any existing document
 * matching its primary-key term is deleted and, when the marshaller produces
 * a document, the fresh version is added. Afterwards any registered
 * searchers are reset so they reopen against the updated index.
 *
 * @param deleteAllFromIndexFirst when true the writer is opened in create
 *                                mode, wiping the entire index first
 * @param valueobjects            the objects to (re-)index
 * @return the number of documents actually added
 * @throws RuntimeException wrapping any IOException raised by the index
 */
public int updateIndex(boolean deleteAllFromIndexFirst, Iterable<T> valueobjects) {
    IndexWriter writer = null;
    try {
        writer = new IndexWriter(directoryBean.getDirectory(), analyzer, deleteAllFromIndexFirst,
                MaxFieldLength.LIMITED);
        int docCount = 0;
        for (T vo : valueobjects) {
            // Delete-then-add implements "replace" for this object's PK term.
            Term pkTerm = this.contentMarshall.getPKTerm(vo);
            writer.deleteDocuments(pkTerm);

            Document doc = this.contentMarshall.serialize(vo);
            if (doc != null) {
                writer.addDocument(doc);
                docCount++;
            }
        }
        if (this.searchersToReset != null) {
            for (LuceneIndexSearcher<?> searcher : this.searchersToReset) {
                searcher.reset();
            }
        }
        return docCount;
    } catch (IOException err) {
        // fix: the old message claimed "deleting documents" only, although
        // this method also adds documents; the cause stays attached.
        throw new RuntimeException("Error updating lucene index", err);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignored) {
                // Best-effort close: any primary failure above is more
                // informative than a secondary close error.
            }
        }
    }
}