List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
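Before the collected examples, a minimal self-contained sketch of the call. The directory path and field names here are illustrative, not taken from any of the projects below; the long return value (an operation sequence number) exists in Lucene 6 and later, while earlier versions return void.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentExample {
    public static void main(String[] args) throws IOException {
        // Open (or create) an index in a local directory; the path is illustrative.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            // StringField: indexed as a single token, not analyzed (good for IDs).
            doc.add(new StringField("id", "1", Field.Store.YES));
            // TextField: analyzed full text.
            doc.add(new TextField("body", "hello lucene", Field.Store.YES));
            writer.addDocument(doc);
            writer.commit();
        }
    }
}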
From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java
License:Open Source License
private void addDoc(IndexWriter w, String title, String number) {
    Document doc = new Document();
    doc.add(new TextField("title", title, Field.Store.YES));
    doc.add(new StringField("number", number, Field.Store.YES));
    try {
        w.addDocument(doc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java
License:Open Source License
@Test
public void testInlineStemmer() throws IOException, ParseException {
    // Initialize the index
    Directory index = new RAMDirectory();
    Analyzer analyzer = new ArabicRootExtractorAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter writer = new IndexWriter(index, config);

    // NOTE: the original Arabic string literals (titles and query) were lost to
    // mis-encoding; the "?" placeholders below stand in for them.
    Document doc = new Document();
    doc.add(new StringField("number", "1", Field.Store.YES));
    doc.add(new TextField("title", "?? ? ? ??", Field.Store.YES));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("number", "2", Field.Store.YES));
    doc.add(new TextField("title", "? ?? ? ?", Field.Store.YES));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("number", "3", Field.Store.YES));
    doc.add(new TextField("title", "? ??", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    //~

    // Query the index
    String queryStr = "";
    Query query = new QueryParser("title", analyzer).parse(queryStr);
    int hitsPerPage = 5;
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs docs = searcher.search(query, hitsPerPage, Sort.INDEXORDER);
    ScoreDoc[] hits = docs.scoreDocs;
    //~

    // Print results
    /*
    System.out.println("Found " + hits.length + " hits:");
    for (ScoreDoc hit : hits) {
        int docId = hit.doc;
        Document d = searcher.doc(docId);
        System.out.printf("\t(%s): %s\n", d.get("number"), d.get("title"));
    }
    */
    //~
}
From source file:com.github.tenorviol.gitsearch.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // TODO: make these exclusions configurable
    String fileName = file.getName();
    if (fileName.charAt(0) == '.') {
        return;
    }
    int dotLoc = fileName.lastIndexOf('.');
    String extension = fileName.substring(dotLoc + 1);
    // known binary extensions
    if (extension.equals("jpg") || extension.equals("png") || extension.equals("gif")
            || extension.equals("pdf") || extension.equals("fla") || extension.equals("flv")
            || extension.equals("swf") || extension.equals("swz")) {
        return;
    }
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
public MinHashClassifier(IndexReader reader, String textField, String categoryField, int min,
        int hashCount, int hashSize) {
    this.min = min;
    this.hashCount = hashCount;
    this.hashSize = hashSize;
    try {
        Analyzer analyzer = createMinHashAnalyzer(min, hashCount, hashSize);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, config);
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document document = new Document();
            Document d = reader.document(i);
            String textValue = d.getField(textField).stringValue();
            String categoryValue = d.getField(categoryField).stringValue();
            document.add(new TextField(TEXT_FIELD, textValue, Field.Store.NO));
            document.add(new StringField(CLASS_FIELD, categoryValue, Field.Store.YES));
            writer.addDocument(document);
        }
        writer.commit();
        writer.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
}
From source file:com.github.tteofili.looseen.Test20NewsgroupsClassification.java
License:Apache License
void buildIndex(File indexDir, IndexWriter indexWriter) throws IOException {
    File[] groupsDir = indexDir.listFiles();
    if (groupsDir != null) {
        for (File group : groupsDir) {
            String groupName = group.getName();
            File[] posts = group.listFiles();
            if (posts != null) {
                for (File postFile : posts) {
                    String number = postFile.getName();
                    NewsPost post = parse(postFile, groupName, number);
                    Document d = new Document();
                    d.add(new StringField(CATEGORY_FIELD, post.getGroup(), Field.Store.YES));
                    d.add(new SortedDocValuesField(CATEGORY_FIELD, new BytesRef(post.getGroup())));
                    d.add(new TextField(SUBJECT_FIELD, post.getSubject(), Field.Store.YES));
                    d.add(new TextField(BODY_FIELD, post.getBody(), Field.Store.YES));
                    indexWriter.addDocument(d);
                }
            }
        }
    }
    indexWriter.commit();
}
From source file:com.github.tteofili.looseen.TestWikipediaClassification.java
License:Apache License
private static void importWikipedia(File dump, IndexWriter indexWriter) throws Exception {
    long start = System.currentTimeMillis();
    int count = 0;
    System.out.format("Importing %s...%n", dump);

    String title = null;
    String text = null;
    Set<String> cats = new HashSet<>();

    XMLInputFactory factory = XMLInputFactory.newInstance();
    StreamSource source;
    if (dump.getName().endsWith(".xml")) {
        source = new StreamSource(dump);
    } else {
        throw new RuntimeException("can index only wikipedia XML files");
    }
    XMLStreamReader reader = factory.createXMLStreamReader(source);
    while (reader.hasNext()) {
        if (count == Integer.MAX_VALUE) {
            break;
        }
        switch (reader.next()) {
        case XMLStreamConstants.START_ELEMENT:
            if ("title".equals(reader.getLocalName())) {
                title = reader.getElementText();
            } else if (TEXT_FIELD.equals(reader.getLocalName())) {
                text = reader.getElementText();
                Matcher matcher = pattern.matcher(text);
                int pos = 0;
                while (matcher.find(pos)) {
                    String group = matcher.group(1);
                    String catName = group.replaceAll("\\|\\s", "").replaceAll("\\|\\*", "");
                    Collections.addAll(cats, catName.split("\\|"));
                    pos = matcher.end();
                }
            }
            break;
        case XMLStreamConstants.END_ELEMENT:
            if ("page".equals(reader.getLocalName())) {
                Document page = new Document();
                if (title != null) {
                    page.add(new TextField(TITLE_FIELD, title, StoredField.Store.YES));
                }
                if (text != null) {
                    page.add(new TextField(TEXT_FIELD, text, StoredField.Store.YES));
                }
                for (String cat : cats) {
                    page.add(new StringField(CATEGORY_FIELD, cat, Field.Store.YES));
                    page.add(new SortedSetDocValuesField(CATEGORY_FIELD, new BytesRef(cat)));
                }
                indexWriter.addDocument(page);
                cats.clear();
                count++;
                if (count % 100000 == 0) {
                    indexWriter.commit();
                    System.out.format("Committed %d pages%n", count);
                }
            }
            break;
        }
    }
    indexWriter.commit();
    long millis = System.currentTimeMillis() - start;
    System.out.format("Imported %d pages in %d seconds (%.2fms/page)%n", count, millis / 1000,
            (double) millis / count);
}
From source file:com.github.wxiaoqi.search.lucene.LuceneDao.java
License:Open Source License
public void create(IndexObject indexObject) {
    IndexWriter indexWriter = null;
    try {
        IndexWriterConfig config = new IndexWriterConfig(this.getAnalyzer());
        indexWriter = new IndexWriter(this.getDirectory(), config);
        indexWriter.addDocument(DocumentUtil.IndexObject2Document(indexObject));
        indexWriter.commit();
    } catch (Exception e) {
        e.printStackTrace();
        try {
            if (indexWriter != null) { // writer construction itself may have failed
                indexWriter.rollback();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    } finally {
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }
}
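Since IndexWriter is AutoCloseable, the same create operation can be written with try-with-resources so the writer is closed on every path. A sketch under that assumption, reusing the class's own getAnalyzer(), getDirectory(), and DocumentUtil helpers; note the semantics shift slightly, since close() commits pending changes and the explicit rollback of the original is dropped:

public void create(IndexObject indexObject) {
    IndexWriterConfig config = new IndexWriterConfig(this.getAnalyzer());
    // try-with-resources closes the writer even if addDocument/commit throws;
    // an explicit rollback path is only needed when partially applied changes
    // must be discarded rather than committed by close().
    try (IndexWriter indexWriter = new IndexWriter(this.getDirectory(), config)) {
        indexWriter.addDocument(DocumentUtil.IndexObject2Document(indexObject));
        indexWriter.commit();
    } catch (Exception e) {
        e.printStackTrace();
    }
}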
From source file:com.globalsight.ling.lucene.Index.java
License:Apache License
public void addDocument(long p_mainId, long p_subId, String p_text) throws IOException {
    synchronized (m_state) {
        if (m_state != STATE_OPENED) {
            throw new IOException("index is not available");
        }
    }

    // clean the Lucene cache if present
    LuceneCache.cleanLuceneCache(m_directory);

    try {
        m_lock.writeLock().acquire();
        IndexWriter tempWriter = null;
        try {
            tempWriter = getIndexWriter(false);
            Document doc = getDocument(p_mainId, p_subId, p_text);
            tempWriter.addDocument(doc);
        } finally {
            m_lock.writeLock().release();
            IOUtils.closeWhileHandlingException(tempWriter);
        }
    } catch (InterruptedException ex) {
        throw new IOException(ex.getMessage());
    }
}
From source file:com.globalsight.ling.tm2.lucene.LuceneIndexWriter.java
License:Apache License
/**
 * Indexes segments. To maintain index integrity, indexes are at
 * first created in memory and merged into a file system index.
 *
 * @param p_tuvs List of BaseTmTuv, SegmentsForSave.AddTuv, or TM3Tuv
 * @param p_sourceLocale true if p_tuvs are source locale segments
 * @param p_indexTargetLocales true for TM3, false for TM2
 */
public void index(List p_tuvs, boolean p_sourceLocale, boolean p_indexTargetLocales) throws Exception {
    IndexWriterConfig conf = new IndexWriterConfig(LuceneUtil.VERSION, m_analyzer);
    conf.setOpenMode(m_isFirst ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
    IndexWriter fsIndexWriter = new IndexWriter(m_directory, conf);
    try {
        for (Iterator it = p_tuvs.iterator(); it.hasNext();) {
            Object tuv = it.next();
            Document doc = tuv instanceof BaseTmTuv
                    ? createDocumentFromBaseTmTuv((BaseTmTuv) tuv, p_sourceLocale, p_indexTargetLocales)
                    : tuv instanceof AddTuv
                            ? createDocumentFromAddTuv((AddTuv) tuv, p_sourceLocale, p_indexTargetLocales)
                            : tuv instanceof TM3Tuv
                                    ? createDocumentFromTM3Tuv((TM3Tuv<GSTuvData>) tuv, p_sourceLocale,
                                            p_indexTargetLocales)
                                    : null;
            fsIndexWriter.addDocument(doc);
        }
    } finally {
        fsIndexWriter.close();
    }

    // clean the Lucene cache if present
    LuceneCache.cleanLuceneCache(m_indexDir);
}
From source file:com.gmail.mosoft521.luceneDemo.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * <p/>
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}