Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find usage examples for org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Document

Adds a document to this index.
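
Before the examples from real projects below, here is a minimal, self-contained sketch of the call. It is only an illustration, assuming a Lucene version (roughly 6.x–8.x) where addDocument returns a sequence number and RAMDirectory is still available; the class name, field name, and text are made up for the sketch.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws Exception {
        Directory directory = new RAMDirectory(); // in-memory index, for the sketch only
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        IndexWriter writer = new IndexWriter(directory, config);
        try {
            Document doc = new Document();
            doc.add(new TextField("title", "Hello Lucene", Field.Store.YES));
            // In recent Lucene versions addDocument returns the operation's sequence number.
            long seqNo = writer.addDocument(doc);
            System.out.println("sequence number: " + seqNo);
        } finally {
            writer.close(); // commits pending changes and releases the write lock
        }
        directory.close();
    }
}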

Usage

From source file:com.tamingtext.frankenstein.Frankenstein.java

License:Apache License

/**
 * Index the content of Frankenstein.
 *
 * @throws IOException
 */
private void index() throws IOException {
    System.out.println("Indexing Frankenstein");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("frankenstein-gutenberg.txt");
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
    //let's index paragraphs at a time
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
    directory = new RAMDirectory();
    IndexWriter iw = new IndexWriter(directory, conf);
    String line;
    StringBuilder paraBuffer = new StringBuilder(2048);
    int lines = 0;
    int paragraphs = 0;
    int paragraphLines = 0;
    while ((line = reader.readLine()) != null) {
        if (line.contains("End of the Project Gutenberg")) {//we are in the license section at the end of the book
            break;
        }
        if (line.startsWith("#")) {//skip comments
            continue;
        }
        //if the line is blank, we have a paragraph, so let's index it
        if (line.matches("^\\s*$") && paraBuffer.length() > 0) {
            Document doc = new Document();
            //We can retrieve by paragraph number if we want
            String theString = paraBuffer.toString();
            theString = theString.trim();
            if (theString.length() > 0 && theString.matches("^\\s*$") == false) {
                addMetadata(doc, lines, paragraphs, paragraphLines);
                doc.add(new Field("paragraph", theString, Field.Store.YES, Field.Index.ANALYZED));//add the main content
                iw.addDocument(doc);//Index the document
                paragraphs++;
            }
            //reset some of our state
            paraBuffer.setLength(0);//we are done w/ this paragraph
            paragraphLines = 0;
        } else {
            paraBuffer.append(line).append(' ');
        }
        lines++;
        paragraphLines++;
    }
    System.out.println("Processed " + lines + " lines.  Paragraphs: " + paragraphs);
    iw.close();
}

From source file:com.tamingtext.fuzzy.OverlapMeasures.java

License:Apache License

public TopDocs cosine(String queryTerm, int n, String... terms) throws IOException, ParseException {
    Directory directory = new RAMDirectory();
    final Pattern pattern = Pattern.compile(".");
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            TokenStream result = null;
            try {
                result = new PatternTokenizer(reader, pattern, 0);
            } catch (IOException e) {
                // the example swallows the exception; result stays null if tokenizer creation fails
            }
            return result;
        }
    };
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    for (String term : terms) {
        Document doc = new Document();
        doc.add(new Field("chars", term, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
    }
    writer.close();
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), terms.length);
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
        System.out.println("Id: " + topDocs.scoreDocs[i].doc + " Val: "
                + searcher.doc(topDocs.scoreDocs[i].doc).get("chars"));
    }
    QueryParser qp = new QueryParser(Version.LUCENE_36, "chars", analyzer);
    Query query = qp.parse(queryTerm);
    return searcher.search(query, n);
}

From source file:com.taobao.common.tedis.support.lucene.analysis.xanalyzer.TestHighLight.java

License:Open Source License

/**
 * @param args
 */
public static void main(String[] args) {

    Directory ramDir = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(ramDir, /* new StandardAnalyzer() */ XFactory.getWriterAnalyzer());
        Document doc = new Document();
        Field fd = new Field(FIELD_NAME, CONTENT, Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS);
        doc.add(fd);
        writer.addDocument(doc);
        writer.optimize();
        writer.close();

        IndexReader reader = IndexReader.open(ramDir);
        String queryString = QUERY;
        QueryParser parser = new QueryParser(FIELD_NAME, /* new StandardAnalyzer() */ XFactory.getWriterAnalyzer());
        Query query = parser.parse(queryString);
        System.out.println(query);
        Searcher searcher = new IndexSearcher(ramDir);
        query = query.rewrite(reader);
        System.out.println(query);
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        Hits hits = searcher.search(query);

        BoldFormatter formatter = new BoldFormatter();
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(50));
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            int maxNumFragmentsRequired = 5;
            String fragmentSeparator = "...";
            TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(hits.id(i), FIELD_NAME);
            TokenStream tokenStream = TokenSources.getTokenStream(tpv);
            /*
             * TokenStream tokenStream2 = (new StandardAnalyzer()) // or XFactory.getWriterAnalyzer()
             *         .tokenStream(FIELD_NAME, new StringReader(text));
             * do {
             *     Token t = tokenStream2.next();
             *     if (t == null) break;
             *     System.out.println("\t" + t.startOffset() + "," + t.endOffset() + "\t" + t.termText());
             * } while (true);
             */
            String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
                    fragmentSeparator);
            System.out.println("\n" + result);
        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.tekstosense.stemmer.index.Indexer.java

License:Open Source License

/**
 * Adds the doc.
 *
 * @param w
 *            the index writer to add the document to
 * @param title
 *            the title to index
 * @param isbn
 *            the ISBN to index
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 */
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("title", title, Store.YES));
    doc.add(new StringField("isbn", isbn, Store.YES));
    w.addDocument(doc);
}

From source file:com.test.LuceneDemo.java

License:Apache License

@Test
public void test() throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    Analyzer analyzer = new StandardAnalyzer();

    // Store the index in memory:
    Directory directory = new RAMDirectory();
    // To store an index on disk, use this instead:
    //Directory directory = FSDirectory.open("/tmp/testindex");
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter iwriter = new IndexWriter(directory, config);
    Document doc = new Document();
    String text = "This is the text to be indexed.";
    doc.add(new Field("fieldname", text, TextField.TYPE_STORED));
    iwriter.addDocument(doc);
    iwriter.close();

    // Now search the index:
    DirectoryReader ireader = DirectoryReader.open(directory);
    IndexSearcher isearcher = new IndexSearcher(ireader);
    // Parse a simple query that searches for "text":
    QueryParser parser = new QueryParser("fieldname", analyzer);
    Query query = parser.parse("indexed");
    ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    // Iterate through the results:
    for (int i = 0; i < hits.length; i++) {
        Document hitDoc = isearcher.doc(hits[i].doc);
        assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
    }
    ireader.close();
    directory.close();
}

From source file:com.tistory.devyongsik.demo.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file); // 11. Open an input Stream for the file.
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                // 12. A Document plays roughly the role of a row in the index.
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:

                // 13. Add fields to the Document; here the file path is indexed as a "path" field.
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setOmitTermFreqAndPositions(true);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.

                // 14. Use a NumericField for the numeric last-modified value.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.

                // 15. Along with "path" and "modified", add the file contents to the Document as a
                //     "contents" field; field values can be given as a String, a Numeric value, or a Reader.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // 16. New index: just add the document.
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file); //17. Create or Update update .
                    //   3.X   API .
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}
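
As the Javadoc above points out, one document per input file is slow; for better throughput the benchmark module's WriteLineDocTask produces files with one document per line. The following rough sketch, not part of the original source, shows how such a line-per-document file could be consumed with the same Lucene 3.x-era API; the method and field names are illustrative only.

static void indexLineDocs(IndexWriter writer, File file) throws IOException {
    BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
    try {
        String line;
        while ((line = reader.readLine()) != null) {
            Document doc = new Document();
            // one Lucene Document per input line
            doc.add(new Field("contents", line, Field.Store.NO, Field.Index.ANALYZED));
            writer.addDocument(doc);
        }
    } finally {
        reader.close();
    }
}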

From source file:com.vmware.dcp.services.common.LuceneBlobIndexService.java

License:Open Source License

protected void handlePost(Operation post) {
    if (post.isRemote()) {
        post.fail(new IllegalStateException("Remote requests not allowed"));
        return;
    }

    Map<String, String> params = UriUtils.parseUriQueryParams(post.getUri());
    String key = params.get(URI_PARAM_NAME_KEY);
    if (key == null) {
        post.fail(new IllegalArgumentException("key query parameter is required"));
        return;
    }

    String updateTimeParam = params.get(URI_PARAM_NAME_UPDATE_TIME);

    if (updateTimeParam == null) {
        post.fail(new IllegalArgumentException("update time query parameter is required"));
        return;
    }

    long updateTime = Long.parseLong(updateTimeParam);
    IndexWriter wr = this.writer;
    if (wr == null) {
        post.fail(new CancellationException());
        return;
    }

    try {
        Object content = post.getBodyRaw();
        if (content == null) {
            post.fail(new IllegalArgumentException("service instance is required"));
            return;
        }
        byte[] binaryContent = new byte[this.maxBinaryContextSizeBytes];
        int count = Utils.toBytes(content, binaryContent, 0);
        Document doc = new Document();
        Field binaryContentField = new StoredField(LUCENE_FIELD_NAME_BINARY_CONTENT, binaryContent, 0, count);
        doc.add(binaryContentField);
        Field keyField = new StringField(URI_PARAM_NAME_KEY, key, Field.Store.NO);
        doc.add(keyField);

        Field updateTimeField = new LongField(URI_PARAM_NAME_UPDATE_TIME, updateTime, this.longStoredField);
        doc.add(updateTimeField);
        wr.addDocument(doc);
        this.indexUpdateTimeMicros = Utils.getNowMicrosUtc();
        post.setBody(null).complete();
    } catch (Throwable e) {
        logSevere(e);
        post.fail(e);
    }
}

From source file:com.vmware.dcp.services.common.LuceneDocumentIndexService.java

License:Open Source License

private void addDocumentToIndex(Operation op, Document doc, ServiceDocument sd, ServiceDocumentDescription desc)
        throws IOException {
    IndexWriter wr = this.writer;
    if (wr == null) {
        op.fail(new CancellationException());
        return;
    }

    long start = Utils.getNowMicrosUtc();
    wr.addDocument(doc);
    updateSelfLinkInfo(sd);

    long end = Utils.getNowMicrosUtc();
    if (hasOption(ServiceOption.INSTRUMENTATION)) {
        ServiceStat s = getHistogramStat(STAT_NAME_INDEXING_DURATION_MICROS);
        setStat(s, end - start);
    }

    op.setBody(null).complete();
    checkDocumentRetentionLimit(sd, desc);

    applyActiveQueries(sd, desc);
}

From source file:com.vmware.xenon.services.common.LuceneBlobIndexService.java

License:Open Source License

public void handlePost(Operation post) {
    if (post.isRemote()) {
        post.fail(new IllegalStateException("Remote requests not allowed"));
        return;
    }

    Map<String, String> params = UriUtils.parseUriQueryParams(post.getUri());
    String key = params.get(URI_PARAM_NAME_KEY);
    if (key == null) {
        post.fail(new IllegalArgumentException("key query parameter is required"));
        return;
    }

    String updateTimeParam = params.get(URI_PARAM_NAME_UPDATE_TIME);

    if (updateTimeParam == null) {
        post.fail(new IllegalArgumentException("update time query parameter is required"));
        return;
    }

    long updateTime = Long.parseLong(updateTimeParam);
    IndexWriter wr = this.writer;
    if (wr == null) {
        post.fail(new CancellationException());
        return;
    }

    try {
        Object content = post.getBodyRaw();
        if (content == null) {
            post.fail(new IllegalArgumentException("service instance is required"));
            return;
        }

        byte[] binaryContent = getBuffer();
        int count = Utils.toBytes(content, binaryContent, 0);
        Document doc = new Document();
        Field binaryContentField = new StoredField(LUCENE_FIELD_NAME_BINARY_CONTENT, binaryContent, 0, count);
        doc.add(binaryContentField);
        Field keyField = new StringField(URI_PARAM_NAME_KEY, key, Field.Store.NO);
        doc.add(keyField);

        LuceneDocumentIndexService.addNumericField(doc, URI_PARAM_NAME_UPDATE_TIME, updateTime, true);

        wr.addDocument(doc);
        this.indexUpdateTimeMicros = Utils.getNowMicrosUtc();
        post.setBody(null).complete();
    } catch (Throwable e) {
        logSevere(e);
        post.fail(e);
    }
}

From source file:com.vmware.xenon.services.common.LuceneDocumentIndexService.java

License:Open Source License

private void addDocumentToIndex(Operation op, Document doc, ServiceDocument sd, ServiceDocumentDescription desc)
        throws IOException {
    IndexWriter wr = this.writer;
    if (wr == null) {
        op.fail(new CancellationException());
        return;
    }

    long start = Utils.getNowMicrosUtc();
    wr.addDocument(doc);
    long end = Utils.getNowMicrosUtc();

    // Use time AFTER index was updated to be sure that it can be compared
    // against the time the searcher was updated and have this change
    // be reflected in the new searcher. If the start time would be used,
    // it is possible to race with updating the searcher and NOT have this
    // change be reflected in the searcher.
    updateLinkAccessTime(end, sd.documentSelfLink);

    if (hasOption(ServiceOption.INSTRUMENTATION)) {
        ServiceStat s = getHistogramStat(STAT_NAME_INDEXING_DURATION_MICROS);
        setStat(s, end - start);
    }

    op.setBody(null).complete();
    checkDocumentRetentionLimit(sd, desc);
    applyActiveQueries(sd, desc);
}