Usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
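Before the collected examples, here is a minimal sketch of the basic call pattern, assuming Lucene 4.6-era APIs to match most of the snippets below (imports from org.apache.lucene.* are elided, as in the examples that follow; the long return value in the signature above, an operation sequence number, only exists in later releases, where addDocument is otherwise used the same way):

    Directory dir = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new StandardAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(dir, config);
    Document doc = new Document();
    // TextField is analyzed; Field.Store.YES keeps the original value retrievable
    doc.add(new TextField("title", "Hello Lucene", Field.Store.YES));
    writer.addDocument(doc); // buffers the document; it becomes searchable after commit()/close()
    writer.close();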
From source file: com.mathworks.xzheng.searching.PhraseQueryTest.java
License: Apache License
protected void setUp() throws IOException {
    dir = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(dir, config);
    Document doc = new Document();
    doc.add(new Field("field",                                      // 1
            "the quick brown fox jumped over the lazy dog",         // 1
            Field.Store.YES,                                        // 1
            Field.Index.ANALYZED));                                 // 1
    writer.addDocument(doc);
    writer.close();
    searcher = new IndexSearcher(DirectoryReader.open(dir));
}
From source file: com.mathworks.xzheng.searching.ScoreTest.java
License: Apache License
private void indexSingleFieldDocs(Field[] fields) throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(directory, config);
    for (Field f : fields) {
        Document doc = new Document();
        doc.add(f);
        writer.addDocument(doc);
    }
    writer.forceMerge(1);
    writer.close();
}
From source file: com.mathworks.xzheng.tools.BerkeleyDbIndexer.java
License: Apache License
public static void main(String[] args) throws IOException, DatabaseException { if (args.length != 1) { System.err.println("Usage: BerkeleyDbIndexer <index dir>"); System.exit(-1);//ww w. j a v a2 s .c o m } File indexFile = new File(args[0]); if (indexFile.exists()) { File[] files = indexFile.listFiles(); for (int i = 0; i < files.length; i++) if (files[i].getName().startsWith("__")) files[i].delete(); indexFile.delete(); } indexFile.mkdir(); EnvironmentConfig envConfig = new EnvironmentConfig(); DatabaseConfig dbConfig = new DatabaseConfig(); envConfig.setTransactional(true); envConfig.setInitializeCache(true); envConfig.setInitializeLocking(true); envConfig.setInitializeLogging(true); envConfig.setAllowCreate(true); envConfig.setThreaded(true); dbConfig.setAllowCreate(true); dbConfig.setType(DatabaseType.BTREE); Environment env = new Environment(indexFile, envConfig); Transaction txn = env.beginTransaction(null, null); Database index = env.openDatabase(txn, "__index__", null, dbConfig); Database blocks = env.openDatabase(txn, "__blocks__", null, dbConfig); txn.commit(); txn = env.beginTransaction(null, null); DbDirectory directory = new DbDirectory(txn, index, blocks); IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_46), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.optimize(); writer.close(); directory.close(); txn.commit(); index.close(); blocks.close(); env.close(); System.out.println("Indexing Complete"); }
From source file: com.mathworks.xzheng.tools.BerkeleyDbJEIndexer.java
License: Apache License
public static void main(String[] args) throws IOException, DatabaseException { if (args.length != 1) { System.err.println("Usage: BerkeleyDbIndexer <index dir>"); System.exit(-1);//from w w w .j a va 2 s . c o m } File indexFile = new File(args[0]); if (indexFile.exists()) { // A File[] files = indexFile.listFiles(); // A for (int i = 0; i < files.length; i++) // A if (files[i].getName().startsWith("__")) // A files[i].delete(); // A indexFile.delete(); // A } indexFile.mkdir(); EnvironmentConfig envConfig = new EnvironmentConfig(); // B DatabaseConfig dbConfig = new DatabaseConfig(); // B envConfig.setTransactional(true); // B envConfig.setAllowCreate(true); // B dbConfig.setTransactional(true); // B dbConfig.setAllowCreate(true); // B Environment env = new Environment(indexFile, envConfig); // C Transaction txn = env.beginTransaction(null, null); // C Database index = env.openDatabase(txn, "__index__", dbConfig); // C Database blocks = env.openDatabase(txn, "__blocks__", dbConfig); // C txn.commit(); // C txn = env.beginTransaction(null, null); // C JEDirectory directory = new JEDirectory(txn, index, blocks); // D IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_46), true, IndexWriter.MaxFieldLength.UNLIMITED); Document doc = new Document(); doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED)); writer.addDocument(doc); writer.optimize(); writer.close(); directory.close(); txn.commit(); index.close(); blocks.close(); env.close(); System.out.println("Indexing Complete"); }
From source file: com.mathworks.xzheng.tools.ChainedFilterTest.java
License: Apache License
public void setUp() throws Exception {
    directory = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));
    IndexWriter writer = new IndexWriter(directory, config);
    Calendar cal = Calendar.getInstance();
    cal.set(2009, 1, 1, 0, 0);                                 // A
    for (int i = 0; i < MAX; i++) {
        Document doc = new Document();
        doc.add(new Field("key", "" + (i + 1), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("owner", (i < MAX / 2) ? "bob" : "sue", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("date", DateTools.timeToString(cal.getTimeInMillis(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        writer.addDocument(doc);
        cal.add(Calendar.DATE, 1);
    }
    writer.close();
    searcher = new IndexSearcher(DirectoryReader.open(directory));
    BooleanQuery bq = new BooleanQuery();                      // B
    bq.add(new TermQuery(new Term("owner", "bob")),            // B
            BooleanClause.Occur.SHOULD);                       // B
    bq.add(new TermQuery(new Term("owner", "sue")),            // B
            BooleanClause.Occur.SHOULD);                       // B
    query = bq;
    cal.set(2099, 1, 1, 0, 0);
    dateFilter = TermRangeFilter.Less("date",                  // C
            new BytesRef(DateTools.timeToString(               // C
                    cal.getTimeInMillis(),                     // C
                    DateTools.Resolution.DAY)));               // C
    bobFilter = new CachingWrapperFilter(                      // D
            new QueryWrapperFilter(                            // D
                    new TermQuery(new Term("owner", "bob")))); // D
    sueFilter = new CachingWrapperFilter(                      // E
            new QueryWrapperFilter(                            // E
                    new TermQuery(new Term("owner", "sue")))); // E
}
From source file: com.mathworks.xzheng.tools.FastVectorHighlighterSample.java
License: Apache License
static void makeIndex() throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir, config);
    for (String d : DOCS) {
        Document doc = new Document();
        doc.add(new Field(F, d, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
    writer.close();
}
From source file: com.meizu.nlp.classification.utils.DatasetSplitter.java
License: Apache License
/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively.
 *
 * @param originalIndex        an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code> if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
        Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {
    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));
    try {
        int size = originalIndex.maxDoc();
        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);
        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPositions(true);
        int b = 0;
        // iterate over existing documents
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.length > 0) {
                for (String fieldName : fieldNames) {
                    doc.add(new Field(fieldName,
                            originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
                }
            } else {
                for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
                    if (storableField.readerValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
                    } else if (storableField.binaryValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
                    } else if (storableField.stringValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
                    } else if (storableField.numericValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
                    }
                }
            }
            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
                testWriter.addDocument(doc);
            } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
                cvWriter.addDocument(doc);
            } else {
                trainingWriter.addDocument(doc);
            }
            b++;
        }
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        testWriter.commit();
        cvWriter.commit();
        trainingWriter.commit();
        // close IWs
        testWriter.close();
        cvWriter.close();
        trainingWriter.close();
    }
}
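For orientation, a hypothetical invocation of split is sketched below. It assumes a two-ratio constructor (as in Lucene's own org.apache.lucene.classification.utils.DatasetSplitter, which this class mirrors) and Lucene 5.x-era readers; sourceDir, trainingDir, testDir, cvDir, and the field names are illustrative placeholders:

    // Hypothetical usage: ~10% test, ~10% cross-validation, the rest training.
    LeafReader reader = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(sourceDir));
    DatasetSplitter splitter = new DatasetSplitter(0.1, 0.1); // testRatio, crossValidationRatio
    splitter.split(reader, trainingDir, testDir, cvDir, new StandardAnalyzer(), "body", "category");
    reader.close();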
From source file: com.meltmedia.cadmium.search.SearchContentPreprocessor.java
License: Apache License
void writeIndex(final IndexWriter indexWriter, File contentDir) throws Exception {
    new ContentScanTemplate(HTML_FILE_FILTER) {
        private Jerry.JerryParser jerryParser = null;

        @Override
        public void handleFile(File file) throws Exception {
            try {
                if (jerryParser == null) {
                    jerryParser = Jerry.jerry().enableHtmlMode();
                    jerryParser.getDOMBuilder().setCaseSensitive(false);
                    jerryParser.getDOMBuilder().setParseSpecialTagsAsCdata(true);
                    jerryParser.getDOMBuilder().setSelfCloseVoidTags(false);
                    jerryParser.getDOMBuilder().setConditionalCommentExpression(null);
                    jerryParser.getDOMBuilder().setEnableConditionalComments(false);
                    jerryParser.getDOMBuilder().setImpliedEndTags(false);
                    jerryParser.getDOMBuilder().setIgnoreComments(true);
                }
                String htmlContent = FileUtils.readFileToString(file, "UTF-8");
                Jerry jerry = jerryParser.parse(htmlContent);
                // if we should not index this file, move on.
                if (!shouldIndex(jerry))
                    return;
                String title = jerry.$("html > head > title").text();
                Jerry removals = jerry.$("title,head,script,[cadmium=\"no-index\"]");
                if (removals.size() > 0) {
                    log.debug("Removing {} element[s]", removals.length());
                    removals.remove();
                } else {
                    log.debug("No elements to remove");
                }
                String textContent = jerry.$("body").text();
                Document doc = new Document();
                doc.add(new TextField("title", title, Field.Store.YES));
                doc.add(new TextField("content", textContent, Field.Store.YES));
                doc.add(new TextField("path", file.getPath().replaceFirst(dataDir.getPath(), ""), Field.Store.YES));
                indexWriter.addDocument(doc);
            } catch (Throwable t) {
                log.warn("Failed to index page [" + file + "]", t);
            }
        }
    }.scan(contentDir);
}
From source file: com.miliworks.virgo.test.LuceneIndexAndSearchDemo.java
License: Apache License
/**
 * Demonstrates indexing and searching with the IK Analyzer.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    // field name and sample text for the Lucene Document
    String fieldName = "text";
    String text = "IK Analyzer???????";
    // instantiate the IK Analyzer in smart mode
    Analyzer analyzer = new IKAnalyzer(true);

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // build an in-memory index
        directory = new RAMDirectory();

        // configure the IndexWriter
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);
        // write a document
        Document doc = new Document();
        doc.add(new StringField("ID", "10000", Field.Store.YES));
        doc.add(new TextField(fieldName, text, Field.Store.YES));
        iwriter.addDocument(doc);
        iwriter.close();

        // searching **********************************
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?";
        // build a Query with QueryParser
        QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // fetch the top 5 hits
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("hits: " + topDocs.totalHits);
        // print the results; iterate over scoreDocs (at most 5), not totalHits
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println(targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file: com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java
License: Open Source License
public void execute() throws IOException {
    org.jsoup.nodes.Document jdoc = null;
    // pull content using Jsoup
    if (m_content != null && !m_content.trim().isEmpty()) {
        jdoc = Jsoup.parse(m_content);
    } else {
        Connection conn = Jsoup.connect(m_url);
        conn.validateTLSCertificates(false);
        conn.followRedirects(true);
        conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
        conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        conn.header("Accept-Language", "en-US,en;q=0.5");
        conn.header("Accept-Encoding", "gzip, deflate");
        conn.execute();
        jdoc = conn.get();
    }
    StringWriter text = new StringWriter();
    if (m_includeMetaKeywords) {
        text.write(jdoc.select("meta[name=keywords]").attr("content"));
        text.write(" ");
    }
    if (m_includeMetaDescription) {
        text.write(jdoc.select("meta[name=description]").attr("content"));
        text.write(" ");
    }
    if (m_includePageTitle) {
        text.write(jdoc.select("title").text());
        text.write(" ");
    }
    text.write(jdoc.select("body").text());
    // analyze content with Lucene
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    Directory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);
    Document doc = new Document();
    Field textField = new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
            TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(textField);
    indexWriter.addDocument(doc);
    indexWriter.commit();
    indexWriter.close();
    IndexReader indexReader = IndexReader.open(directory, true);
    TermFreqVector termFreqVector = null;
    for (int i = 0; i < indexReader.maxDoc(); i++) {
        termFreqVector = indexReader.getTermFreqVector(i, "content");
        String[] terms = termFreqVector.getTerms();
        int[] freqs = termFreqVector.getTermFrequencies();
        for (int n = 0; n < termFreqVector.size(); n++) {
            if (m_excludeList.contains(terms[n])) {
                continue;
            }
            add(terms[n], freqs[n]);
        }
    }
    indexReader.close();
    directory.close();
    // sort map by value
    sortMap();
}