Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usages of org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.

Usage

From source file: com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java

License: Open Source License

/**
 * Adds a single document to the index with a tokenized, stored "title"
 * field and an untokenized, stored "number" field.
 *
 * @param w      writer to add the document to
 * @param title  analyzed and stored in the "title" TextField
 * @param number stored verbatim (not tokenized) in the "number" StringField
 */
private void addDoc(IndexWriter w, String title, String number) {
    Document doc = new Document();
    doc.add(new TextField("title", title, Field.Store.YES));
    doc.add(new StringField("number", number, Field.Store.YES));
    try {
        // NOTE(review): IOException is swallowed (only printed), so a failed
        // add surfaces later as a missing document rather than a test error.
        w.addDocument(doc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file: com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java

License: Open Source License

@Test
public void testInlineStemmer() throws IOException, ParseException {

    // Initialize an in-memory index over three titles using the
    // root-extracting Arabic analyzer.
    // NOTE(review): the "title" string literals and queryStr appear to have
    // been mojibake'd to "?" characters in this copy — the originals were
    // presumably Arabic text; verify against the upstream repository.
    Directory index = new RAMDirectory();
    Analyzer analyzer = new ArabicRootExtractorAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter writer = new IndexWriter(index, config);

    Document doc = new Document();
    doc.add(new StringField("number", "1", Field.Store.YES));
    doc.add(new TextField("title", "?? ? ? ??",
            Field.Store.YES));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("number", "2", Field.Store.YES));
    doc.add(new TextField("title", "? ?? ? ?",
            Field.Store.YES));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("number", "3", Field.Store.YES));
    doc.add(new TextField("title", "? ??", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    //~

    // Query the index with the same analyzer used at index time.
    String queryStr = "";
    Query query = new QueryParser("title", analyzer).parse(queryStr);

    int hitsPerPage = 5;
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs docs = searcher.search(query, hitsPerPage, Sort.INDEXORDER);

    ScoreDoc[] hits = docs.scoreDocs;
    //~

    // Print results (left disabled; the test only exercises the pipeline).
    /*
    System.out.println("Found " + hits.length + " hits:");
    for (ScoreDoc hit : hits) {
       int docId = hit.doc;
       Document d = searcher.doc(docId);
       System.out.printf("\t(%s): %s\n", d.get("number"), d.get("title"));
    }
    */
    //~

}

From source file: com.github.tenorviol.gitsearch.IndexFiles.java

License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException if reading the file or writing the index fails
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // TODO: make these exclusions configurable
    // Skip dot-files (hidden on Unix) entirely.
    String fileName = file.getName();
    if (fileName.charAt(0) == '.') {
        return;
    }
    // If there is no '.', lastIndexOf returns -1 and substring(0) yields the
    // whole file name, so extension-less files are compared as-is below.
    int dotLoc = fileName.lastIndexOf('.');
    String extension = fileName.substring(dotLoc + 1);
    // known binary extensions (case-sensitive: "JPG", "PDF" etc. are NOT excluded)
    if (extension.equals("jpg") || extension.equals("png") || extension.equals("gif") || extension.equals("pdf")
            || extension.equals("fla") || extension.equals("flv") || extension.equals("swf")
            || extension.equals("swz")) {
        return;
    }

    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                // NOTE(review): Field.Index / setIndexOptions is the pre-4.x
                // Lucene API — this snippet targets an old Lucene version.
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file: com.github.tteofili.looseen.MinHashClassifier.java

License: Apache License

/**
 * Builds an in-memory MinHash index from the given source reader: for every
 * source document, the text field is re-analyzed with a MinHash analyzer and
 * stored alongside its category label.
 *
 * @param reader        source index to copy documents from
 * @param textField     name of the field holding the text to hash
 * @param categoryField name of the field holding the class label
 * @param min           min-shingle size for the MinHash analyzer
 * @param hashCount     number of hash functions
 * @param hashSize      size of each hash
 * @throws RuntimeException wrapping any IOException from indexing
 */
public MinHashClassifier(IndexReader reader, String textField, String categoryField, int min, int hashCount,
        int hashSize) {
    this.min = min;
    this.hashCount = hashCount;
    this.hashSize = hashSize;
    try {
        Analyzer analyzer = createMinHashAnalyzer(min, hashCount, hashSize);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        directory = new RAMDirectory();
        // try-with-resources: the original leaked the writer (it was never
        // closed) when addDocument or commit threw an IOException.
        try (IndexWriter writer = new IndexWriter(directory, config)) {
            for (int i = 0; i < reader.maxDoc(); i++) {
                Document document = new Document();
                Document d = reader.document(i);
                // NOTE(review): getField() returns null if a document lacks
                // the field, which would NPE here — assumes every source doc
                // carries both fields; verify against the source index.
                String textValue = d.getField(textField).stringValue();
                String categoryValue = d.getField(categoryField).stringValue();
                document.add(new TextField(TEXT_FIELD, textValue, Field.Store.NO));
                document.add(new StringField(CLASS_FIELD, categoryValue, Field.Store.YES));
                writer.addDocument(document);
            }
            writer.commit();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
}

From source file: com.github.tteofili.looseen.Test20NewsgroupsClassification.java

License: Apache License

/**
 * Indexes a 20-newsgroups corpus laid out as {@code indexDir/<group>/<post>}:
 * one Lucene document per post, carrying the category (stored and as doc
 * values for sorting/grouping), subject, and body. Commits once at the end.
 */
void buildIndex(File indexDir, IndexWriter indexWriter) throws IOException {
    File[] groups = indexDir.listFiles();
    if (groups != null) {
        for (File groupDir : groups) {
            String category = groupDir.getName();
            File[] postFiles = groupDir.listFiles();
            if (postFiles == null) {
                // listFiles() returns null on I/O error — skip this group
                continue;
            }
            for (File postFile : postFiles) {
                NewsPost post = parse(postFile, category, postFile.getName());
                Document doc = new Document();
                doc.add(new StringField(CATEGORY_FIELD, post.getGroup(), Field.Store.YES));
                doc.add(new SortedDocValuesField(CATEGORY_FIELD, new BytesRef(post.getGroup())));
                doc.add(new TextField(SUBJECT_FIELD, post.getSubject(), Field.Store.YES));
                doc.add(new TextField(BODY_FIELD, post.getBody(), Field.Store.YES));
                indexWriter.addDocument(doc);
            }
        }
    }
    indexWriter.commit();
}

From source file: com.github.tteofili.looseen.TestWikipediaClassification.java

License: Apache License

/**
 * Streams a Wikipedia XML dump and indexes one document per {@code <page>}
 * element, with title, text, and any "[[Category:...]]" names found in the
 * text (stored and as sorted-set doc values). Commits every 100000 pages and
 * once at the end, then prints throughput statistics.
 *
 * @param dump        a .xml Wikipedia dump file
 * @param indexWriter destination index (left open; caller owns it)
 * @throws Exception on XML parsing or indexing errors
 */
private static void importWikipedia(File dump, IndexWriter indexWriter) throws Exception {
    long start = System.currentTimeMillis();
    int count = 0;
    System.out.format("Importing %s...%n", dump);

    String title = null;
    String text = null;
    Set<String> cats = new HashSet<>();

    XMLInputFactory factory = XMLInputFactory.newInstance();
    StreamSource source;
    if (dump.getName().endsWith(".xml")) {
        source = new StreamSource(dump);
    } else {
        throw new RuntimeException("can index only wikipedia XML files");
    }
    XMLStreamReader reader = factory.createXMLStreamReader(source);
    try {
        while (reader.hasNext()) {
            if (count == Integer.MAX_VALUE) {
                break;
            }
            switch (reader.next()) {
            case XMLStreamConstants.START_ELEMENT:
                if ("title".equals(reader.getLocalName())) {
                    title = reader.getElementText();
                } else if (TEXT_FIELD.equals(reader.getLocalName())) {
                    text = reader.getElementText();
                    // Collect category names: each pattern match may hold
                    // several names separated by '|'.
                    Matcher matcher = pattern.matcher(text);
                    int pos = 0;
                    while (matcher.find(pos)) {
                        String group = matcher.group(1);
                        String catName = group.replaceAll("\\|\\s", "").replaceAll("\\|\\*", "");
                        Collections.addAll(cats, catName.split("\\|"));
                        pos = matcher.end();
                    }
                }
                break;
            case XMLStreamConstants.END_ELEMENT:
                if ("page".equals(reader.getLocalName())) {
                    Document page = new Document();
                    if (title != null) {
                        page.add(new TextField(TITLE_FIELD, title, StoredField.Store.YES));
                    }
                    if (text != null) {
                        page.add(new TextField(TEXT_FIELD, text, StoredField.Store.YES));
                    }
                    for (String cat : cats) {
                        page.add(new StringField(CATEGORY_FIELD, cat, Field.Store.YES));
                        page.add(new SortedSetDocValuesField(CATEGORY_FIELD, new BytesRef(cat)));
                    }
                    indexWriter.addDocument(page);
                    cats.clear();
                    count++;
                    if (count % 100000 == 0) {
                        indexWriter.commit();
                        System.out.format("Committed %d pages%n", count);
                    }
                }
                break;
            }
        }

        indexWriter.commit();
    } finally {
        // Fix: the original never closed the StAX reader, leaking the
        // underlying stream on both the success and error paths.
        reader.close();
    }

    long millis = System.currentTimeMillis() - start;
    System.out.format("Imported %d pages in %d seconds (%.2fms/page)%n", count, millis / 1000,
            (double) millis / count);
}

From source file: com.github.wxiaoqi.search.lucene.LuceneDao.java

License: Open Source License

/**
 * Converts the given object to a Lucene document and commits it to the index.
 * A new writer is opened per call and always closed; on failure the write is
 * rolled back. Errors are logged (printed), not propagated.
 *
 * @param indexObject object to convert (via DocumentUtil) and index
 */
public void create(IndexObject indexObject) {

    IndexWriter indexWriter = null;
    try {
        IndexWriterConfig config = new IndexWriterConfig(this.getAnalyzer());
        indexWriter = new IndexWriter(this.getDirectory(), config);
        indexWriter.addDocument(DocumentUtil.IndexObject2Document(indexObject));
        indexWriter.commit();
    } catch (Exception e) {
        e.printStackTrace();
        // Fix: if the IndexWriter constructor itself threw (e.g. the index
        // lock is held), indexWriter is still null — the original then
        // raised an NPE here and again in the finally block.
        if (indexWriter != null) {
            try {
                indexWriter.rollback();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
    } finally {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException e1) {
                e1.printStackTrace();
            }
        }
    }
}

From source file: com.globalsight.ling.lucene.Index.java

License: Apache License

/**
 * Adds one segment document to this index under the write lock.
 *
 * @param p_mainId main identifier of the segment
 * @param p_subId  sub identifier of the segment
 * @param p_text   segment text to index
 * @throws IOException if the index is not open, indexing fails, or the
 *                     thread is interrupted while waiting for the lock
 */
public void addDocument(long p_mainId, long p_subId, String p_text) throws IOException {
    // Fail fast if the index has been closed or is otherwise unavailable.
    synchronized (m_state) {
        if (m_state != STATE_OPENED) {
            throw new IOException("index is not available");
        }
    }

    // clean cache if have
    LuceneCache.cleanLuceneCache(m_directory);

    try {
        m_lock.writeLock().acquire();

        IndexWriter tempWriter = null;
        try {
            tempWriter = getIndexWriter(false);
            Document doc = getDocument(p_mainId, p_subId, p_text);
            tempWriter.addDocument(doc);
        } finally {
            m_lock.writeLock().release();
            IOUtils.closeWhileHandlingException(tempWriter);
        }
    } catch (InterruptedException ex) {
        // Fix: restore the interrupt flag and keep the original exception as
        // the cause — the original wrapped only getMessage(), losing both.
        Thread.currentThread().interrupt();
        throw new IOException(ex.getMessage(), ex);
    }
}

From source file: com.globalsight.ling.tm2.lucene.LuceneIndexWriter.java

License: Apache License

/**
 * Indexes segments. To maintain index integrity, indexes are at
 * first created in memory and merged into a file system index.
 *
 * @param p_tuvs List of BaseTmTuv, SegmentsForSave.AddTuv, or TM3Tuv
 * @param p_sourceLocale true if p_tuvs are source locale segments
 * @param p_indexTargetLocales true for TM3, false for TM2
 * @throws Exception on indexing errors, or IllegalArgumentException if an
 *         element of p_tuvs is not one of the supported TUV types
 */
public void index(List p_tuvs, boolean p_sourceLocale, boolean p_indexTargetLocales) throws Exception {
    IndexWriterConfig conf = new IndexWriterConfig(LuceneUtil.VERSION, m_analyzer);
    // First use creates the index; subsequent uses append.
    conf.setOpenMode(m_isFirst ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
    IndexWriter fsIndexWriter = new IndexWriter(m_directory, conf);

    try {
        for (Iterator it = p_tuvs.iterator(); it.hasNext();) {
            Object tuv = it.next();

            // Dispatch on the concrete TUV type (was a triple-nested ternary).
            Document doc;
            if (tuv instanceof BaseTmTuv) {
                doc = createDocumentFromBaseTmTuv((BaseTmTuv) tuv, p_sourceLocale, p_indexTargetLocales);
            } else if (tuv instanceof AddTuv) {
                doc = createDocumentFromAddTuv((AddTuv) tuv, p_sourceLocale, p_indexTargetLocales);
            } else if (tuv instanceof TM3Tuv) {
                doc = createDocumentFromTM3Tuv((TM3Tuv<GSTuvData>) tuv, p_sourceLocale, p_indexTargetLocales);
            } else {
                // Fix: the original fell through with doc == null and hit an
                // opaque NPE inside addDocument; fail with context instead.
                throw new IllegalArgumentException("unsupported TUV type: "
                        + (tuv == null ? "null" : tuv.getClass().getName()));
            }

            fsIndexWriter.addDocument(doc);
        }
    } finally {
        fsIndexWriter.close();
    }

    // clean cache if have
    LuceneCache.cleanLuceneCache(m_indexDir);
}

From source file: com.gmail.mosoft521.luceneDemo.IndexFiles.java

License: Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * <p/>
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur (list() returns null in that case)
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}