Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.

Usage

From source file:aos.lucene.tools.BerkeleyDbJEIndexer.java

License:Apache License

/**
 * Builds a tiny Lucene index whose files live inside a Berkeley DB JE
 * environment (via JEDirectory) instead of an ordinary filesystem directory.
 *
 * Usage: BerkeleyDbIndexer &lt;index dir&gt;
 *
 * @param args single argument: directory used for the JE environment
 * @throws IOException       on Lucene index write failure
 * @throws DatabaseException on Berkeley DB environment/database failure
 */
public static void main(String[] args) throws IOException, DatabaseException {
    if (args.length != 1) {
        System.err.println("Usage: BerkeleyDbIndexer <index dir>");
        System.exit(-1);
    }

    File indexFile = new File(args[0]);

    // Wipe any previous run: remove the JE database files (named "__*"),
    // then the directory itself.
    // NOTE(review): File.delete() fails silently when the directory still
    // contains other files — confirm that is acceptable here.
    if (indexFile.exists()) {
        File[] files = indexFile.listFiles();
        for (int i = 0; i < files.length; i++)

            if (files[i].getName().startsWith("__"))
                files[i].delete();
        indexFile.delete();
    }

    indexFile.mkdir();

    // Environment and databases are transactional so the index contents
    // are committed atomically below.
    EnvironmentConfig envConfig = new EnvironmentConfig();
    DatabaseConfig dbConfig = new DatabaseConfig();

    envConfig.setTransactional(true);
    envConfig.setAllowCreate(true);
    dbConfig.setTransactional(true);
    dbConfig.setAllowCreate(true);

    Environment env = new Environment(indexFile, envConfig);

    // First transaction only creates the two databases backing JEDirectory.
    Transaction txn = env.beginTransaction(null, null);
    Database index = env.openDatabase(txn, "__index__", dbConfig);
    Database blocks = env.openDatabase(txn, "__blocks__", dbConfig);
    txn.commit();
    txn = env.beginTransaction(null, null);

    // All index writes below happen inside this second transaction.
    JEDirectory directory = new JEDirectory(txn, index, blocks);

    IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_46), true,
            IndexWriter.MaxFieldLength.UNLIMITED);

    Document doc = new Document();
    doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);

    writer.merge(writer.getNextMerge());
    writer.close();

    // Close the directory before committing so buffered blocks are flushed
    // into the transaction; then release databases and environment in order.
    directory.close();
    txn.commit();

    index.close();
    blocks.close();
    env.close();

    LOGGER.info("Indexing Complete");
}

From source file:aos.lucene.tools.ChainedFilterTest.java

License:Apache License

/**
 * Builds the shared test fixture: an in-memory index of MAX documents — the
 * first half owned by "bob", the rest by "sue", one per consecutive day
 * starting 2009-02-01 — plus the owner query and the date/owner filters the
 * test cases combine.
 */
@Override
public void setUp() throws Exception {

    directory = new RAMDirectory();

    IndexWriter indexWriter = new IndexWriter(directory, new WhitespaceAnalyzer(Version.LUCENE_46),
            IndexWriter.MaxFieldLength.UNLIMITED);

    Calendar calendar = Calendar.getInstance();
    calendar.set(2009, 1, 1, 0, 0);

    for (int docNum = 0; docNum < MAX; docNum++) {
        // First half of the documents belong to bob, the rest to sue.
        String owner = (docNum < MAX / 2) ? "bob" : "sue";
        String day = DateTools.timeToString(calendar.getTimeInMillis(), DateTools.Resolution.DAY);

        Document doc = new Document();
        doc.add(new Field("key", "" + (docNum + 1), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("owner", owner, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("date", day, Field.Store.YES, Field.Index.NOT_ANALYZED));
        indexWriter.addDocument(doc);

        calendar.add(Calendar.DATE, 1);
    }

    indexWriter.close();

    searcher = new IndexSearcher(directory);

    // Query matching every document: owner is either bob or sue.
    BooleanQuery ownerQuery = new BooleanQuery();
    ownerQuery.add(new TermQuery(new Term("owner", "bob")), BooleanClause.Occur.SHOULD);
    ownerQuery.add(new TermQuery(new Term("owner", "sue")), BooleanClause.Occur.SHOULD);
    query = ownerQuery;

    // Date filter accepting everything indexed above (anything before 2099-02-01).
    calendar.set(2099, 1, 1, 0, 0);
    dateFilter = TermRangeFilter.Less("date",
            DateTools.timeToString(calendar.getTimeInMillis(), DateTools.Resolution.DAY));

    bobFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("owner", "bob"))));

    sueFilter = new CachingWrapperFilter(new QueryWrapperFilter(new TermQuery(new Term("owner", "sue"))));
}

From source file:aos.lucene.tools.FastVectorHighlighterSample.java

License:Apache License

/**
 * Rebuilds the sample index from scratch: one analyzed document per entry in
 * DOCS, storing term vectors with positions and offsets so the
 * FastVectorHighlighter can mark up matches.
 */
static void makeIndex() throws IOException {
    IndexWriter indexWriter = new IndexWriter(dir, analyzer, true, MaxFieldLength.UNLIMITED);
    for (String text : DOCS) {
        Document document = new Document();
        document.add(new Field(F, text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        indexWriter.addDocument(document);
    }
    indexWriter.close();
}

From source file:aos.lucene.tools.SpatialLuceneExample.java

License:Apache License

/**
 * Indexes one named location: the raw name, prefix-coded latitude/longitude,
 * a "metafile" marker field, and one cartesian-tier box-id field per tier
 * level 5..15 so the document can be found by spatial queries.
 *
 * @param writer index writer that receives the document
 * @param name   human-readable location name
 * @param lat    latitude in degrees
 * @param lng    longitude in degrees
 * @throws IOException if the document cannot be added to the index
 */
private void addLocation(IndexWriter writer, String name, double lat, double lng) throws IOException {

    Document doc = new Document();
    doc.add(new Field("name", name, Field.Store.YES, Field.Index.ANALYZED));

    // Coordinates are prefix-coded so they order correctly as strings.
    doc.add(new Field(latField, NumericUtils.doubleToPrefixCoded(lat),
            Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field(lngField, NumericUtils.doubleToPrefixCoded(lng),
            Field.Store.YES, Field.Index.NOT_ANALYZED));

    doc.add(new Field("metafile", "doc", Field.Store.YES, Field.Index.ANALYZED));

    IProjector projector = new SinusoidalProjector();

    final int startTier = 5;
    final int endTier = 15;

    // One box-id field per tier granularity, from coarse (5) to fine (15).
    for (int tier = startTier; tier <= endTier; tier++) {
        CartesianTierPlotter plotter = new CartesianTierPlotter(tier, projector, tierPrefix);

        double boxId = plotter.getTierBoxId(lat, lng);
        LOGGER.info("Adding field " + plotter.getTierFieldName() + ":" + boxId);
        doc.add(new Field(plotter.getTierFieldName(), NumericUtils
                .doubleToPrefixCoded(boxId), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS));

    }

    writer.addDocument(doc);
    LOGGER.info("===== Added Doc to index ====");
}

From source file:api.startup.PDFIndexer.java

License:Open Source License

/**
 * Indexes a single PDF document and writes it to the given index writer:
 * path, author, title, conference and last-modified metadata alongside the
 * extracted full text (stored with complete term-vector information for
 * highlighting). In CREATE mode the document is added; otherwise any
 * previously indexed copy with the same path is replaced.
 *
 * @param writer   the index writer to write to
 * @param metadata the document metadata, including the PDF's filename
 * @throws IOException declared for signature compatibility; read failures
 *                     are caught and logged rather than propagated
 */
static void indexDoc(IndexWriter writer, DocumentMetadata metadata) throws IOException {
    Path file = Paths.get(metadata.getFilename());
    try {
        Document doc = new Document();

        Field pathField = new StringField(Constants.FIELD_PATH, file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add Document metadata //
        doc.add(new StringField(Constants.FIELD_AUTHOR, metadata.getAuthor(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_TITLE, metadata.getTitle(), Field.Store.YES));
        doc.add(new StringField(Constants.FIELD_CONFERENCE, metadata.getConference(), Field.Store.YES));
        // End of Document Metadata //

        Field modified = new LongField(Constants.FIELD_MODIFIED, Files.getLastModifiedTime(file).toMillis(),
                Field.Store.YES);
        doc.add(modified);

        PDFTextExtractor extractor = new PDFTextExtractor();
        // Get the string contents
        String textContents = extractor.extractText(file.toString());

        // Store the contents with full term-vector information (positions,
        // payloads, offsets) so results can be highlighted later.
        FieldType contentsType = new FieldType();
        contentsType.setStored(true);
        contentsType.setTokenized(true);
        contentsType.setStoreTermVectors(true);
        contentsType.setStoreTermVectorPositions(true);
        contentsType.setStoreTermVectorPayloads(true);
        contentsType.setStoreTermVectorOffsets(true);
        contentsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
        Field contents = new Field(Constants.FIELD_CONTENTS, textContents, contentsType);
        doc.add(contents);

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            log.info("adding " + file + " to index");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            log.info("updating " + file + " in index");
            writer.updateDocument(new Term(Constants.FIELD_PATH, file.toString()), doc);
        }
    } catch (IOException e) {
        // Pass the exception to the logger so the stack trace and root cause
        // are not lost (previously only the filename was reported).
        log.error("Failed to read file " + metadata.getFilename(), e);
    }

}

From source file:aplicacion.sistema.indexer.test.IndexFiles.java

License:Apache License

/**
 * Recursively indexes a file or a whole directory tree. Unreadable entries
 * are skipped, and files that vanish between listing and indexing (common
 * with temporary files on Windows) are reported and skipped instead of
 * aborting the walk.
 *
 * @param writer destination index writer
 * @param file   file or directory to index
 * @throws IOException if adding a readable document to the index fails
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            System.out.println("adding " + file);
            try {
                writer.addDocument(FileDocument.Document(file));
            }
            // at least on windows, some temporary files raise this exception with an "access denied" message
            // checking if the file can be read doesn't help; continue with the
            // remaining files, but leave a trace instead of failing silently
            catch (FileNotFoundException fnfe) {
                System.out.println("skipping " + file + ": " + fnfe.getMessage());
            }
        }
    }
}

From source file:Application.mediaIndexer.java

/**
 * Indexes a single media file: all Tika-extracted metadata fields, the file
 * path, and the last-modified time. Adds the document when the writer is in
 * CREATE mode, otherwise replaces any previously indexed document with the
 * same path.
 *
 * @param writer       destination index writer
 * @param file         media file to index
 * @param results      UI text area that receives progress messages
 * @param lastModified file modification time in epoch milliseconds
 * @throws IOException   if the file cannot be read or written to the index
 * @throws SAXException  if Tika's SAX content handling fails
 * @throws TikaException if Tika cannot parse the media file
 */
public static void indexDoc(IndexWriter writer, Path file, TextArea results, long lastModified)
        throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = Files.newInputStream(file)) {
        parser.parse(stream, handler, metadata);
        Document doc = new Document();
        // Every metadata field Tika found becomes a stored, analyzed field.
        String[] metadataNames = metadata.names();
        for (String name : metadataNames)
            doc.add(new TextField(name, metadata.get(name), Field.Store.YES));
        doc.add(new StringField("path", file.toString(), Field.Store.YES));
        doc.add(new LongPoint("modified", lastModified));
        results.appendText("Title: " + metadata.get("title") + "\n");
        results.appendText("Artists: " + metadata.get("xmpDM:artist") + "\n");
        results.appendText("Genre: " + metadata.get("xmpDM:genre") + "\n");
        results.appendText("Year: " + metadata.get("xmpDM:releaseDate") + "\n");
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can
            // be there):
            results.appendText("adding " + file + "\n");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been
            // indexed):
            // fix: trailing newline was missing, unlike every other message
            results.appendText("updating " + file + "\n");
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:application.ReviewDocumentIndexer.java

License:Open Source License

/**
 * @param args/*ww  w  . jav  a  2  s  .c om*/
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) {
    // Parse command line arguments. Exit program is provided arguments are insufficient
    ReviewDocumentIndexer indexer = new ReviewDocumentIndexer(args);
    if (indexer == null)
        return;

    // Open a new index
    IndexWriter index = null;
    try {
        index = new IndexWriter(new SimpleFSDirectory(new File(Paths.luceneIndex)),
                new ReviewTextAnalyzer(indexer), indexer.new_index ? true : false, MaxFieldLength.UNLIMITED);
        if (indexer.pause_every > 2) {
            index.setMaxBufferedDocs(indexer.pause_every);
        }
        index.setMaxMergeDocs(Config.maxMergeDocs);
        index.setMergeFactor(Config.mergeFactor);
    } catch (CorruptIndexException e) {
        AppLogger.error.log(Level.SEVERE,
                "Lucene detected an inconsistency upon opening the index located at " + Paths.luceneIndex);
        throw new RuntimeException("Exiting application", e);
    } catch (LockObtainFailedException e) {
        AppLogger.error.log(Level.SEVERE,
                "Index located at " + Paths.luceneIndex + " is already open by another Lucene process");
        throw new RuntimeException("Exiting application", e);
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE, "Could not access location " + Paths.luceneIndex);
        throw new RuntimeException("Exiting application", e);
    }

    // Load a number of reviews from database
    NumberFormat docIdFormat = TokenListsCollector.defaultDocIdFormat();
    try {
        DatabaseReviewCollection reviews = new DatabaseReviewCollection(indexer.pause_every);
        reviews.setLimits(indexer.min_reviewid, indexer.stop_after);
        int indexed_counter = 0;

        while (reviews.hasNextSegment()) {

            System.out.print(Calendar.getInstance().getTime().toGMTString());

            System.out.print(" Loading from DB... ");
            reviews.loadNextSegment();
            Iterator<Review> reviewsIterator = reviews.getIterator();

            System.out.print(" Indexing... ");
            while (reviewsIterator.hasNext()) {
                DatabaseReview dbr = (DatabaseReview) reviewsIterator.next();
                int dbr_id = dbr.getReviewid();
                int dbr_rating = dbr.getRating();

                try {
                    indexer.theReviewId.set(dbr_id);
                    indexer.theStats.setCurrent(dbr_id, dbr_rating);

                    index.addDocument(dbr.getDocumentForIndexing());
                    indexed_counter++;

                    // Also, keep track of the rating and length of this review
                    indexer.theStats.storeCurrent();

                } catch (CorruptIndexException e) {
                    AppLogger.error.log(Level.SEVERE,
                            "Lucene detected an inconsistency upon saving review #"
                                    + Integer.toString(dbr.getReviewid()) + "to the index located at "
                                    + Paths.luceneIndex);
                    return;
                } catch (IOException e) {
                    AppLogger.error.log(Level.WARNING,
                            "Review #" + Integer.toString(dbr.getReviewid()) + " could not be indexed");
                }
            }

            // Backup everything
            System.out.print("Indexed " + indexed_counter + " reviews total. ");
            if (indexer.pause_every > 0) {
                System.out.print("Saving tokenlists... ");
                indexer.theTokenLists.writeNextFile(docIdFormat);

                System.out.print("Saving state... ");
                try {
                    index.commit();
                    indexer.saveState();
                } catch (CorruptIndexException e) {
                    AppLogger.error.log(Level.SEVERE, "Committing index changes failed on review #"
                            + indexer.theReviewId.get() + "due to CorruptIndexException");
                    return;
                } catch (IOException e) {
                    AppLogger.error.log(Level.WARNING, "Committing index changes failed on review #"
                            + indexer.theReviewId.get() + "due to IOException");
                }
            }
            System.out.print("DONE\n");

            reviews.reset();
        }
    } catch (SQLException e) {
        AppLogger.error.log(Level.SEVERE,
                "An exception occured while trying to access the database.\n" + e.getMessage());
        return;
    }

    try {
        index.close();
        indexer.backupIndex();
    } catch (CorruptIndexException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    System.err.println("Indexing successfully completed!");
    return;
}

From source file:apps.LuceneIndexer.java

License:Apache License

/**
 * Command-line indexer: reads documents from a pluggable DocumentSource,
 * cleans their text, and writes them into a Lucene index configured with a
 * (fixed or stock) BM25 similarity. Optionally emits a TREC-format QREL
 * file recording each document's relevance grade.
 *
 * @param args see the Options built below (-i input, -o output dir,
 *             -r qrel file, BM25 parameters, -source_type)
 */
public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    IndexWriter indexWriter = null;
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        // Usage(...) reports the problem; required options fall through here.
        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        if (qrelFileName != null)
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));

        // Validate the output directory before opening the index in it.
        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
            OpenMode.CREATE creates a new index or overwrites an existing one.
            https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
        */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        // Index every document the source yields: an ID field plus the
        // cleaned, re-joined full text.
        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            // Mirror the relevance judgement into the QREL file, if requested.
            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            // Progress report every 1000 documents.
            if (docNum % 1000 == 0)
                System.out.println(String.format("Indexed %d documents", docNum));

        }

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        // Always close both writers, even on the failure paths above.
        try {
            if (null != indexWriter)
                indexWriter.close();
            if (null != qrelWriter)
                qrelWriter.close();
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}

From source file:arena.lucene.LuceneIndexUpdater.java

License:Open Source License

/**
 * Re-indexes the given value objects: for each one, any existing document
 * matching its primary-key term is deleted and, when the marshaller produces
 * a document, the fresh version is added. Afterwards any registered
 * searchers are reset so they reopen against the updated index.
 *
 * @param deleteAllFromIndexFirst when true the writer is opened in create
 *                                mode, wiping the entire index first
 * @param valueobjects            the objects to (re-)index
 * @return the number of documents actually added
 * @throws RuntimeException wrapping any IOException raised by the index
 */
public int updateIndex(boolean deleteAllFromIndexFirst, Iterable<T> valueobjects) {
    IndexWriter writer = null;
    try {
        writer = new IndexWriter(directoryBean.getDirectory(), analyzer, deleteAllFromIndexFirst,
                MaxFieldLength.LIMITED);
        int docCount = 0;
        for (T vo : valueobjects) {
            // Delete-then-add implements "replace" for this object's PK term.
            Term pkTerm = this.contentMarshall.getPKTerm(vo);
            writer.deleteDocuments(pkTerm);

            Document doc = this.contentMarshall.serialize(vo);
            if (doc != null) {
                writer.addDocument(doc);
                docCount++;
            }
        }
        if (this.searchersToReset != null) {
            for (LuceneIndexSearcher<?> searcher : this.searchersToReset) {
                searcher.reset();
            }
        }
        return docCount;
    } catch (IOException err) {
        // fix: the old message claimed "deleting documents" only, although
        // this method also adds documents; the cause stays attached.
        throw new RuntimeException("Error updating lucene index", err);
    } finally {
        if (writer != null) {
            try {
                writer.close();
            } catch (IOException ignored) {
                // Best-effort close: any primary failure above is more
                // informative than a secondary close error.
            }
        }
    }
}