Example usage for org.apache.lucene.index IndexWriter addDocument

Introduction

This page lists example usages of org.apache.lucene.index.IndexWriter#addDocument, collected from open-source projects.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
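
In recent Lucene releases (6.2 and later) addDocument returns a sequence number, as the prototype above shows; most examples below target older 3.x/4.x APIs, where it returned void. The following minimal, self-contained sketch targets a recent Lucene; the index path "index-dir" and the field name "title" are illustrative only.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("index-dir"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            doc.add(new TextField("title", "Hello Lucene", Field.Store.YES));
            long seqNo = writer.addDocument(doc); // sequence number, returned since Lucene 6.2
            System.out.println("added document, sequence number = " + seqNo);
        }
        dir.close();
    }
}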

Usage

From source file:com.mathworks.xzheng.searching.PhraseQueryTest.java

License:Apache License

protected void setUp() throws IOException {
    dir = new RAMDirectory();

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));

    IndexWriter writer = new IndexWriter(dir, config);
    Document doc = new Document();
    doc.add(new Field("field", // 1
            "the quick brown fox jumped over the lazy dog", // 1
            Field.Store.YES, // 1
            Field.Index.ANALYZED)); // 1
    writer.addDocument(doc);
    writer.close();

    searcher = new IndexSearcher(DirectoryReader.open(dir));
}

From source file:com.mathworks.xzheng.searching.ScoreTest.java

License:Apache License

private void indexSingleFieldDocs(Field[] fields) throws Exception {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));

    IndexWriter writer = new IndexWriter(directory, config);
    for (Field f : fields) {
        Document doc = new Document();
        doc.add(f);
        writer.addDocument(doc);
    }
    writer.forceMerge(1);
    writer.close();
}

From source file:com.mathworks.xzheng.tools.BerkeleyDbIndexer.java

License:Apache License

public static void main(String[] args) throws IOException, DatabaseException {
    if (args.length != 1) {
        System.err.println("Usage: BerkeleyDbIndexer <index dir>");
        System.exit(-1);
    }

    File indexFile = new File(args[0]);

    if (indexFile.exists()) {
        File[] files = indexFile.listFiles();

        for (int i = 0; i < files.length; i++)
            if (files[i].getName().startsWith("__"))
                files[i].delete();
        indexFile.delete();
    }

    indexFile.mkdir();

    EnvironmentConfig envConfig = new EnvironmentConfig();
    DatabaseConfig dbConfig = new DatabaseConfig();

    envConfig.setTransactional(true);
    envConfig.setInitializeCache(true);
    envConfig.setInitializeLocking(true);
    envConfig.setInitializeLogging(true);
    envConfig.setAllowCreate(true);
    envConfig.setThreaded(true);
    dbConfig.setAllowCreate(true);
    dbConfig.setType(DatabaseType.BTREE);

    Environment env = new Environment(indexFile, envConfig);

    Transaction txn = env.beginTransaction(null, null);
    Database index = env.openDatabase(txn, "__index__", null, dbConfig);
    Database blocks = env.openDatabase(txn, "__blocks__", null, dbConfig);
    txn.commit();

    txn = env.beginTransaction(null, null);
    DbDirectory directory = new DbDirectory(txn, index, blocks);

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new StandardAnalyzer(Version.LUCENE_46));
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // replaces the 3.x 'create' boolean flag
    IndexWriter writer = new IndexWriter(directory, config);

    Document doc = new Document();
    doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);

    writer.forceMerge(1); // optimize() was removed in Lucene 4.x
    writer.close();

    directory.close();
    txn.commit();

    index.close();
    blocks.close();
    env.close();

    System.out.println("Indexing Complete");
}

From source file:com.mathworks.xzheng.tools.BerkeleyDbJEIndexer.java

License:Apache License

public static void main(String[] args) throws IOException, DatabaseException {
    if (args.length != 1) {
        System.err.println("Usage: BerkeleyDbJEIndexer <index dir>");
        System.exit(-1);
    }

    File indexFile = new File(args[0]);

    if (indexFile.exists()) { // A
        File[] files = indexFile.listFiles(); // A
        for (int i = 0; i < files.length; i++) // A
            if (files[i].getName().startsWith("__")) // A
                files[i].delete(); // A
        indexFile.delete(); // A
    }

    indexFile.mkdir();

    EnvironmentConfig envConfig = new EnvironmentConfig(); // B
    DatabaseConfig dbConfig = new DatabaseConfig(); // B

    envConfig.setTransactional(true); // B
    envConfig.setAllowCreate(true); // B
    dbConfig.setTransactional(true); // B
    dbConfig.setAllowCreate(true); // B

    Environment env = new Environment(indexFile, envConfig); // C

    Transaction txn = env.beginTransaction(null, null); // C
    Database index = env.openDatabase(txn, "__index__", dbConfig); // C
    Database blocks = env.openDatabase(txn, "__blocks__", dbConfig); // C
    txn.commit(); // C
    txn = env.beginTransaction(null, null); // C

    JEDirectory directory = new JEDirectory(txn, index, blocks); // D

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new StandardAnalyzer(Version.LUCENE_46));
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // replaces the 3.x 'create' boolean flag
    IndexWriter writer = new IndexWriter(directory, config);

    Document doc = new Document();
    doc.add(new Field("contents", "The quick brown fox...", Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);

    writer.forceMerge(1); // optimize() was removed in Lucene 4.x
    writer.close();

    directory.close();
    txn.commit();

    index.close();
    blocks.close();
    env.close();

    System.out.println("Indexing Complete");
}

From source file:com.mathworks.xzheng.tools.ChainedFilterTest.java

License:Apache License

public void setUp() throws Exception {
    directory = new RAMDirectory();

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
            new WhitespaceAnalyzer(Version.LUCENE_46));

    IndexWriter writer = new IndexWriter(directory, config);

    Calendar cal = Calendar.getInstance();
    cal.set(2009, 1, 1, 0, 0); // A - note: Calendar months are zero-based, so 1 = February

    for (int i = 0; i < MAX; i++) {
        Document doc = new Document();
        doc.add(new Field("key", "" + (i + 1), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("owner", (i < MAX / 2) ? "bob" : "sue", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("date", DateTools.timeToString(cal.getTimeInMillis(), DateTools.Resolution.DAY),
                Field.Store.YES, Field.Index.NOT_ANALYZED));
        writer.addDocument(doc);

        cal.add(Calendar.DATE, 1);
    }

    writer.close();

    searcher = new IndexSearcher(DirectoryReader.open(directory));

    BooleanQuery bq = new BooleanQuery(); // B
    bq.add(new TermQuery(new Term("owner", "bob")), // B
            BooleanClause.Occur.SHOULD); // B
    bq.add(new TermQuery(new Term("owner", "sue")), // B
            BooleanClause.Occur.SHOULD); // B
    query = bq;

    cal.set(2099, 1, 1, 0, 0);
    dateFilter = TermRangeFilter.Less("date", // C
            new BytesRef(DateTools.timeToString( // C
                    cal.getTimeInMillis(), // C
                    DateTools.Resolution.DAY)));// C

    bobFilter = new CachingWrapperFilter( // D
            new QueryWrapperFilter( // D
                    new TermQuery(new Term("owner", "bob")))); // D

    sueFilter = new CachingWrapperFilter( // E
            new QueryWrapperFilter( // E
                    new TermQuery(new Term("owner", "sue")))); // E
}

From source file:com.mathworks.xzheng.tools.FastVectorHighlighterSample.java

License:Apache License

static void makeIndex() throws IOException {
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

    IndexWriter writer = new IndexWriter(dir, config);
    for (String d : DOCS) {
        Document doc = new Document();
        doc.add(new Field(F, d, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
        writer.addDocument(doc);
    }
    writer.close();
}

From source file:com.meizu.nlp.classification.utils.DatasetSplitter.java

License:Apache License

/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex        an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex        a {@link Directory} used to write the training index
 * @param testIndex            a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer             {@link Analyzer} used to create the new docs
 * @param fieldNames           names of fields that need to be put in the new indexes or <code>null</code> if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
        Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {

    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));

    try {
        int size = originalIndex.maxDoc();

        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);

        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPositions(true);

        int b = 0;

        // iterate over existing documents
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {

            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.length > 0) {
                for (String fieldName : fieldNames) {
                    doc.add(new Field(fieldName,
                            originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
                }
            } else {
                for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
                    if (storableField.readerValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
                    } else if (storableField.binaryValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
                    } else if (storableField.stringValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
                    } else if (storableField.numericValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
                    }
                }
            }

            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
                testWriter.addDocument(doc);
            } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
                cvWriter.addDocument(doc);
            } else {
                trainingWriter.addDocument(doc);
            }
            b++;
        }
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        testWriter.commit();
        cvWriter.commit();
        trainingWriter.commit();
        // close IWs
        testWriter.close();
        cvWriter.close();
        trainingWriter.close();
    }
}
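
A call site for split(...) might look like the following sketch. The directory paths, the 0.1 ratios, and the (testRatio, crossValidationRatio) constructor (which mirrors Lucene's own DatasetSplitter) are assumptions for illustration, not taken from the source above.

    Directory sourceDir = FSDirectory.open(Paths.get("original-index"));
    // split() expects a LeafReader; wrap the composite DirectoryReader (Lucene 5.x API)
    LeafReader originalIndex = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(sourceDir));

    DatasetSplitter splitter = new DatasetSplitter(0.1, 0.1); // assumed ctor: test and cross-validation ratios
    splitter.split(originalIndex,
            FSDirectory.open(Paths.get("train-index")),   // training index
            FSDirectory.open(Paths.get("test-index")),    // test index
            FSDirectory.open(Paths.get("cv-index")),      // cross-validation index
            new StandardAnalyzer(),
            "title", "body");                             // copy only these fields; omit to copy all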

From source file:com.meltmedia.cadmium.search.SearchContentPreprocessor.java

License:Apache License

void writeIndex(final IndexWriter indexWriter, File contentDir) throws Exception {
    new ContentScanTemplate(HTML_FILE_FILTER) {

        private Jerry.JerryParser jerryParser = null;

        @Override
        public void handleFile(File file) throws Exception {
            try {
                if (jerryParser == null) {
                    jerryParser = Jerry.jerry().enableHtmlMode();
                    jerryParser.getDOMBuilder().setCaseSensitive(false);
                    jerryParser.getDOMBuilder().setParseSpecialTagsAsCdata(true);
                    jerryParser.getDOMBuilder().setSelfCloseVoidTags(false);
                    jerryParser.getDOMBuilder().setConditionalCommentExpression(null);
                    jerryParser.getDOMBuilder().setEnableConditionalComments(false);
                    jerryParser.getDOMBuilder().setImpliedEndTags(false);
                    jerryParser.getDOMBuilder().setIgnoreComments(true);
                }
                String htmlContent = FileUtils.readFileToString(file, "UTF-8");
                Jerry jerry = jerryParser.parse(htmlContent);

                // if we should not index this file, move on.
                if (!shouldIndex(jerry))
                    return;

                String title = jerry.$("html > head > title").text();

                Jerry removals = jerry.$("title,head,script,[cadmium=\"no-index\"]");
                if (removals.size() > 0) {
                    log.debug("Removing {} element[s]", removals.length());
                    removals.remove();
                } else {
                    log.debug("No elements to remove");
                }

                String textContent = jerry.$("body").text();

                Document doc = new Document();
                doc.add(new TextField("title", title, Field.Store.YES));
                doc.add(new TextField("content", textContent, Field.Store.YES));
                doc.add(new TextField("path", file.getPath().replaceFirst(dataDir.getPath(), ""),
                        Field.Store.YES));
                indexWriter.addDocument(doc);
            } catch (Throwable t) {
                log.warn("Failed to index page [" + file + "]", t);
            }

        }
    }.scan(contentDir);

}

From source file:com.miliworks.virgo.test.LuceneIndexAndSearchDemo.java

License:Apache License

/**
 * Demonstrates indexing and searching with the IK Analyzer.
 * @param args
 */
public static void main(String[] args) {
    // Lucene Document field name
    String fieldName = "text";
    // sample text to index (the original Chinese sample text was lost to mis-encoding)
    String text = "IK Analyzer is an open-source Chinese word-segmentation toolkit "
            + "combining dictionary-based and grammar-based segmentation.";

    // instantiate the IK Analyzer in smart-segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // create an in-memory index directory
        directory = new RAMDirectory();

        // configure the IndexWriterConfig
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);
        // build and add a document
        Document doc = new Document();
        doc.add(new StringField("ID", "10000", Field.Store.YES));
        doc.add(new TextField(fieldName, text, Field.Store.YES));
        iwriter.addDocument(doc);
        iwriter.close();

        // search **********************************
        // instantiate the reader and searcher
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "segmentation toolkit"; // substituted for the mis-encoded original keyword
        // build a Query with QueryParser
        QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // retrieve the 5 most relevant hits
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("Total hits: " + topDocs.totalHits);
        // iterate and print the results
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) { // use the returned hits, not totalHits, to avoid overrunning the array
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println("Document: " + targetDoc.toString());
        }

    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:com.mmiagency.knime.nodes.keyworddensity.util.KeywordDensityHelper.java

License:Open Source License

public void execute() throws IOException {

    org.jsoup.nodes.Document jdoc = null;

    // pull content using Jsoup 
    if (m_content != null && !m_content.trim().isEmpty()) {
        jdoc = Jsoup.parse(m_content);
    } else {
        Connection conn = Jsoup.connect(m_url);

        conn.validateTLSCertificates(false);
        conn.followRedirects(true);
        conn.userAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:40.0) Gecko/20100101 Firefox/40.0");
        conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        conn.header("Accept-Language", "en-US,en;q=0.5");
        conn.header("Accept-Encoding", "gzip, deflate");

        conn.execute();
        jdoc = conn.get();
    }

    StringWriter text = new StringWriter();

    if (m_includeMetaKeywords) {
        text.write(jdoc.select("meta[name=keywords]").attr("content"));
        text.write(" ");
    }
    if (m_includeMetaDescription) {
        text.write(jdoc.select("meta[name=description]").attr("content"));
        text.write(" ");
    }
    if (m_includePageTitle) {
        text.write(jdoc.select("title").text());
        text.write(" ");
    }

    text.write(jdoc.select("body").text());

    // analyze content with Lucene
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    Directory directory = new RAMDirectory();
    IndexWriter indexWriter = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);

    Document doc = new Document();
    Field textField = new Field("content", text.toString(), Field.Store.YES, Field.Index.ANALYZED,
            TermVector.WITH_POSITIONS_OFFSETS);

    doc.add(textField);

    indexWriter.addDocument(doc);
    indexWriter.commit();
    indexWriter.close();

    IndexReader indexReader = IndexReader.open(directory, true);

    TermFreqVector termFreqVector = null;

    for (int i = 0; i < indexReader.maxDoc(); i++) {
        termFreqVector = indexReader.getTermFreqVector(i, "content");

        String[] terms = termFreqVector.getTerms();
        int[] freqs = termFreqVector.getTermFrequencies();

        for (int n = 0; n < termFreqVector.size(); n++) {
            if (m_excludeList.contains(terms[n])) {
                continue;
            }
            add(terms[n], freqs[n]);
        }
    }

    indexReader.close();
    directory.close();

    // sort map by value
    sortMap();
}
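
This example uses the Lucene 3.x term-vector API (IndexReader.open, TermFreqVector, getTermFreqVector), which was removed in Lucene 4.0. On 4.x the same per-document frequencies can be read through the Terms/TermsEnum API; the following is a hedged sketch of the equivalent loop, assuming the same m_excludeList field and add(term, freq) helper as above.

    IndexReader reader = DirectoryReader.open(directory);
    for (int i = 0; i < reader.maxDoc(); i++) {
        Terms termVector = reader.getTermVector(i, "content"); // per-document term vector
        if (termVector == null) continue;                      // no term vector stored for this field
        TermsEnum termsEnum = termVector.iterator(null);       // Lucene 4.x; no-arg iterator() in 5.x+
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            String text = term.utf8ToString();
            if (!m_excludeList.contains(text)) {
                add(text, (int) termsEnum.totalTermFreq());    // frequency within this document
            }
        }
    }
    reader.close();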