Example usage for org.apache.lucene.index IndexWriter addDocument

Introduction

This page collects example usages of org.apache.lucene.index.IndexWriter.addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.
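
As a quick orientation before the project-specific examples below, here is a minimal, self-contained sketch of the call. It assumes Lucene 6+ (where IndexWriterConfig takes just an analyzer and addDocument returns a sequence number); the index path and the "body" field name are illustrative choices, not part of the API.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // open (or create) an index in a local directory; the path is illustrative
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            // a tokenized, stored text field; the field name is an assumption
            doc.add(new TextField("body", "hello lucene", Store.YES));
            // addDocument returns the sequence number assigned to this operation (Lucene 6+)
            long seqNo = writer.addDocument(doc);
            System.out.println("indexed with sequence number " + seqNo);
        }
    }
}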

Usage

From source file:com.shaie.UTF8Indexing.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Directory dir = new RAMDirectory();
    final StandardAnalyzer analyzer = new StandardAnalyzer();
    final IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(dir, conf);

    final Document doc = new Document();
    doc.add(new TextField("f", "Russia\u2013United States relations", Store.YES));
    writer.addDocument(doc);
    writer.close();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final QueryParser qp = new QueryParser("f", analyzer);
    search(searcher, qp, "Russia United States relations");
    search(searcher, qp, "\"Russia United states relations\"");
    search(searcher, qp, "\"Russia-United states relations\"");
    search(searcher, qp, "\"Russia\u2013United states relations\"");
    reader.close();

    dir.close();
}

From source file:com.shmsoft.dmass.main.FileProcessor.java

License:Apache License

/**
 * Search metadata and file contents.
 *
 * @param metadata the document metadata to search
 * @return true if a match is found, false otherwise
 */
private boolean isResponsive(Metadata metadata) {
    // set true if search finds a match
    boolean isResponsive = false;

    // get culling parameters
    String queryString = Project.getProject().getCullingAsTextBlock();

    // TODO parse important parameters to mappers and reducers individually, not globally
    IndexWriter writer = null;
    RAMDirectory idx = null;
    try {
        // construct a RAMDirectory to hold the in-memory representation of the index.
        idx = new RAMDirectory();

        // make a writer to create the index
        writer = new IndexWriter(idx, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        writer.addDocument(createDocument(metadata));

        // optimize and close the writer to finish building the index
        writer.optimize();
        writer.close();

        // add the built index to the filesystem-based Lucene index
        if (Project.getProject().isLuceneFSIndexEnabled() && luceneIndex != null) {
            luceneIndex.addToIndex(idx);
        }

        SolrIndex.getInstance().addBatchData(metadata);

        if (queryString == null || queryString.trim().isEmpty()) {
            return true;
        }

        // build an IndexSearcher using the in-memory index
        Searcher searcher = new IndexSearcher(idx);
        // search directory
        isResponsive = search(searcher, queryString);

        searcher.close();
    } catch (Exception e) {
        // TODO handle this better
        // if anything happens - don't stop processing
        e.printStackTrace(System.out);
    } finally {
        try {
            if (writer != null) {
                writer.close();
            }
            if (idx != null) {
                idx.close();
            }
        } catch (Exception e) {
            // swallow exception, what else can you do now?
        }
    }
    return isResponsive;
}

From source file:com.silverwrist.dynamo.index.IndexServiceImpl.java

License:Mozilla Public License

public void addItem(String item_namespace, String item_name, Object item, String scope, java.util.Date date,
        DynamoUser owner, String text) throws IndexException {
    // Create a new Lucene Document containing the item information.
    Document doc = new Document();
    doc.add(Field.Keyword("id", createTag(item_namespace, item_name, item)));
    doc.add(Field.Keyword("date", date));
    doc.add(Field.Keyword("owner", owner.getName()));
    doc.add(Field.Keyword("scope", scope));
    doc.add(Field.UnStored("text", text));

    try { // Use an IndexWriter to write it to the index.
        IndexWriter iwr = new IndexWriter(m_directory, m_analyzer, false);
        iwr.addDocument(doc);
        iwr.close();

    } // end try
    catch (IOException e) { // translate Lucene's IOException here
        IndexException ie = new IndexException(IndexServiceImpl.class, "IndexMessages", "addItem.fail", e);
        ie.setParameter(0, item_namespace);
        ie.setParameter(1, item_name);
        ie.setParameter(2, m_identity.toString());
        throw ie;

    } // end catch

}

From source file:com.slieer.app.lecene3x.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 *             If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term
                // frequency or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e.
                // efficiently filterable with NumericRangeFilter). This
                // indexes to millisecond resolution, which is often too
                // fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the
                // resolution you require. For example the long value
                // 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is
                // tokenized and indexed, but not stored. Note that the
                // stream is decoded as UTF-8; if that is not the file's
                // actual encoding, searching for special characters will
                // fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.slieer.app.lecene3x.LuceneIndexAndSearchDemo.java

License:Apache License

/**
 * Lucene indexing and search demo using the IK Analyzer.
 * 
 * @param args
 */
public static void main(String[] args) {
    // Lucene Document field name
    String fieldName = "text";
    // sample texts to index
    String text = "IK Analyzer???????";
    String text1 = "? (Chinese Word Segmentation) ???????????";
    String text2 = "?????,,??,?";

    // construct the IKAnalyzer (true enables smart segmentation mode)
    Analyzer analyzer = new IKAnalyzer(true);

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // create an in-memory index directory
        directory = new RAMDirectory();

        // configure the IndexWriterConfig
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);
        // build and add the documents
        Document doc = new Document();
        //document.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
        Field strField = new StringField("ID", "10000", Field.Store.YES);
        Field textField = new StringField(fieldName, text, Field.Store.YES);
        //textField.setBoost(2);

        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        doc = new Document();
        strField = new StringField("ID", "10001", Field.Store.YES);
        textField = new StringField(fieldName, text1, Field.Store.YES);
        //strField.setBoost(1);
        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        doc = new Document();
        strField = new StringField("ID", "10002", Field.Store.YES);
        // textField = new TextField(fieldName, text2, Field.Store.YES);
        textField = new StringField(fieldName, text2, Field.Store.YES);
        //strField.setBoost(1);
        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        iwriter.close();

        // ********************** search **********************
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?";
        // use QueryParser to build the Query
        QueryParser qp = new QueryParser(Version.LUCENE_4_9, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // retrieve the top 5 matching documents
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("" + topDocs.totalHits);
        // 
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println("" + targetDoc.toString());
        }

    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:com.slieer.lucene.apachedemo.IndexFiles.java

License:Apache License

static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term
                // frequency or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e.
                // efficiently filterable with NumericRangeFilter). This
                // indexes to millisecond resolution, which is often too
                // fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the
                // resolution you require. For example the long value
                // 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is
                // tokenized and indexed, but not stored. Note that the
                // stream is decoded as UTF-8; if that is not the file's
                // actual encoding, searching for special characters will
                // fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.soebes.supose.core.lucene.LuceneTest.java

License:Open Source License

@BeforeClass
public void beforeClass() throws CorruptIndexException, LockObtainFailedException, IOException {
    Analyzer analyzer = AnalyzerFactory.createInstance();

    // To store an index on disk, use this instead:
    // Directory directory = FSDirectory.getDirectory("/tmp/testindex");
    IndexWriter iwriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    iwriter.setMaxFieldLength(25000);

    Document doc = new Document();
    String text = "This is the text to be indexed.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(1));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/trunk/doc/testXML.doc");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is different text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(2));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/tags/docs/XYZabc.java");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is more different text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(3));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/tags/docs/SCMPlan.doc");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is the third text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(4));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/trunk/subdir/elviraXML.doc");
    iwriter.addDocument(doc);

    iwriter.optimize();
    iwriter.close();

    isearcher = new IndexSearcher(directory);
}

From source file:com.soebes.supose.core.scan.IndexMergeTest.java

License:Open Source License

public void testIndex1() throws Exception {
    Index index = new Index();
    IndexWriter indexWriter = index.createIndexWriter("index1");
    Document doc = new Document();
    addTokenizedField(doc, "revision", "1");
    addTokenizedField(doc, "revision", "2");
    indexWriter.addDocument(doc);
    indexWriter.close();
}

From source file:com.soebes.supose.core.scan.IndexMergeTest.java

License:Open Source License

public void testIndex2() throws Exception {
    Index index = new Index();
    IndexWriter indexWriter = index.createIndexWriter("index2");
    Document doc = new Document();
    addTokenizedField(doc, "revision", "3");
    addTokenizedField(doc, "revision", "4");
    indexWriter.addDocument(doc);
    indexWriter.close();
}

From source file:com.soebes.supose.core.scan.ScanRepository.java

License:Open Source License

/**
 * The method will index a particular document (file) into the Lucene index.
 * It will store the majority of the information about a file into the
 * Lucene index, such as revision, copyfrom, path, filename, etc.
 *
 * @param indexRevision
 * @param indexWriter
 * @param dirEntry
 * @param logEntry
 * @param entryPath
 * @throws SVNException
 * @throws IOException
 */
private void indexFile(RevisionDocument indexRevision, IndexWriter indexWriter, SVNDirEntry dirEntry,
        SVNLogEntry logEntry, SVNLogEntryPath entryPath) throws SVNException, IOException {
    SVNProperties fileProperties = new SVNProperties();

    SVNNodeKind nodeKind = null;
    // if the entry has been deleted we will check the information
    // about the entry via the revision before...
    LOGGER.debug("Before checking...");
    nodeKind = repository.getRepository().checkPath(entryPath.getPath(), logEntry.getRevision());
    LOGGER.debug("After checking...");

    indexRevision.addUnTokenizedField(FieldNames.REVISION, NumberUtils.pad(logEntry.getRevision()));

    boolean isDir = nodeKind == SVNNodeKind.DIR;
    boolean isFile = nodeKind == SVNNodeKind.FILE;
    FileName fileName = null;
    if (isDir) {
        LOGGER.debug("The " + entryPath.getPath() + " is a directory entry.");
        indexRevision.addUnTokenizedField(FieldNames.NODE, "dir");
        fileName = new FileName(entryPath.getPath(), true);

        if (getFiltering().ignorePath(fileName.getPath())) {
            // Ignore the path...
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("The following " + fileName.getPath()
                        + " is beeing ignored based on filtering (ignorePath()).");
            }
            return;
        }

    } else if (isFile) {
        LOGGER.debug("The " + entryPath.getPath() + " is a file entry.");
        indexRevision.addUnTokenizedField(FieldNames.NODE, "file");
        fileName = new FileName(entryPath.getPath(), false);

        if (getFiltering().ignoreFilename(fileName.getBaseName())) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("The following " + fileName.getBaseName()
                        + " is beeing ignored based on filtering (ignoreFilename()).");
            }
            // Ignore filename
            return;
        }

    } else {
        // This means a file/directory has been deleted.
        indexRevision.addUnTokenizedField(FieldNames.NODE, "unknown");
        LOGGER.debug("The " + entryPath.getPath() + " is an unknown entry.");

        // We would like to know what it has been.
        // Directory? File? So we go a step back in history...
        long rev = logEntry.getRevision() - 1;
        SVNNodeKind nodeKindUnknown = getRepository().getRepository().checkPath(entryPath.getPath(), rev);
        LOGGER.debug("NodeKind(" + rev + "): " + nodeKindUnknown.toString());
        fileName = new FileName(entryPath.getPath(), nodeKindUnknown == SVNNodeKind.DIR);
    }

    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug(
                "FileNameCheck: entryPath   -> kind:" + nodeKind.toString() + " path:" + entryPath.getPath());
        LOGGER.debug("FileNameCheck:                path:'" + fileName.getPath() + "' filename:'"
                + fileName.getBaseName() + "'");
    }

    // TODO: We have to check if we need to set localization
    indexRevision.addUnTokenizedFieldNoStore(FieldNames.PATH, fileName.getPath().toLowerCase());
    indexRevision.addUnTokenizedField(FieldNames.PATH, fileName.getPath());

    // Did a copy operation take place?
    if (entryPath.getCopyPath() != null) {
        indexRevision.addUnTokenizedField(FieldNames.FROM, entryPath.getCopyPath());
        indexRevision.addUnTokenizedField(FieldNames.FROMREV, entryPath.getCopyRevision());
    }

    // The field we use for searching is stored as lowercase.
    // TODO: We have to check if we need to set localization
    indexRevision.addUnTokenizedFieldNoStore(FieldNames.FILENAME, fileName.getBaseName().toLowerCase());
    indexRevision.addUnTokenizedField(FieldNames.FILENAME, fileName.getBaseName());

    indexRevision.addUnTokenizedField(FieldNames.AUTHOR,
            logEntry.getAuthor() == null ? "" : logEntry.getAuthor());

    // We will add the message as tokenized field to be able to search
    // within the log messages.
    indexRevision.addTokenizedField(FieldNames.MESSAGE,
            logEntry.getMessage() == null ? "" : logEntry.getMessage());
    indexRevision.addUnTokenizedField(FieldNames.DATE, logEntry.getDate());

    indexRevision.addUnTokenizedField(FieldNames.KIND, String.valueOf(entryPath.getType()).toLowerCase());

    // TODO: Maybe we don't need this if we use the repository name?
    indexRevision.addUnTokenizedField(FieldNames.REPOSITORYUUID,
            getRepository().getRepository().getRepositoryUUID(false));

    indexRevision.addUnTokenizedField(FieldNames.REPOSITORY, getName());

    if (nodeKind == SVNNodeKind.NONE) {
        LOGGER.debug("The " + entryPath.getPath() + " is a NONE entry.");
    } else if (nodeKind == SVNNodeKind.DIR) {
        // The given entry is a directory.
        LOGGER.debug("The " + entryPath.getPath() + " is a directory.");
        // Here we need to call getDir to get directory properties.
        Collection<SVNDirEntry> dirEntries = null;
        getRepository().getRepository().getDir(entryPath.getPath(), logEntry.getRevision(), fileProperties,
                dirEntries);
        indexProperties(fileProperties, indexRevision);

    } else if (nodeKind == SVNNodeKind.FILE) {

        // The given entry is a file.
        // This means we will get every file from the repository....
        // Get only the properties of the file

        indexRevision.addTokenizedField(FieldNames.SIZE, Long.toString(dirEntry.getSize()));
        getRepository().getRepository().getFile(entryPath.getPath(), logEntry.getRevision(), fileProperties,
                null);
        indexProperties(fileProperties, indexRevision);

        FileExtensionHandler feh = new FileExtensionHandler();
        feh.setFileProperties(fileProperties);
        feh.setDoc(indexRevision);
        feh.execute(getRepository(), dirEntry, entryPath.getPath(), logEntry.getRevision());
    }

    indexWriter.addDocument(indexRevision.getDoc());
    LOGGER.debug("File " + entryPath.getPath() + " indexed...");
}