Example usage for org.apache.lucene.index IndexWriter addDocument

Introduction

This page collects example usages of org.apache.lucene.index.IndexWriter.addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.
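
As a quick orientation before the project-specific examples below, here is a minimal, self-contained sketch of the call. It assumes Lucene 6+ (where IndexWriterConfig takes just an analyzer and addDocument returns a sequence number); the index path and the "body" field name are illustrative choices, not part of the API.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // open (or create) an index in a local directory; the path is illustrative
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            // a tokenized, stored text field; the field name is an assumption
            doc.add(new TextField("body", "hello lucene", Store.YES));
            // addDocument returns the sequence number assigned to this operation (Lucene 6+)
            long seqNo = writer.addDocument(doc);
            System.out.println("indexed with sequence number " + seqNo);
        }
    }
}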

Usage

From source file:com.shaie.UTF8Indexing.java

License:Apache License

@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Directory dir = new RAMDirectory();
    final StandardAnalyzer analyzer = new StandardAnalyzer();
    final IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(dir, conf);

    final Document doc = new Document();
    doc.add(new TextField("f", "Russia\u2013United States relations", Store.YES));
    writer.addDocument(doc);
    writer.close();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final QueryParser qp = new QueryParser("f", analyzer);
    search(searcher, qp, "Russia United States relations");
    search(searcher, qp, "\"Russia United states relations\"");
    search(searcher, qp, "\"Russia-United states relations\"");
    search(searcher, qp, "\"Russia\u2013United states relations\"");
    reader.close();

    dir.close();
}

From source file:com.shmsoft.dmass.main.FileProcessor.java

License:Apache License

/**
 * Search metadata and file contents.
 *
 * @param metadata the document metadata to search
 * @return true if a match is found, false otherwise
 */
private boolean isResponsive(Metadata metadata) {
    // set true if search finds a match
    boolean isResponsive = false;

    // get culling parameters
    String queryString = Project.getProject().getCullingAsTextBlock();

    // TODO parse important parameters to mappers and reducers individually, not globally
    IndexWriter writer = null;
    RAMDirectory idx = null;
    try {
        // construct a RAMDirectory to hold the in-memory representation of the index.
        idx = new RAMDirectory();

        // make a writer to create the index
        writer = new IndexWriter(idx, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);

        writer.addDocument(createDocument(metadata));

        // optimize and close the writer to finish building the index
        writer.optimize();
        writer.close();

        // add the built index to the filesystem-based Lucene index
        if (Project.getProject().isLuceneFSIndexEnabled() && luceneIndex != null) {
            luceneIndex.addToIndex(idx);
        }

        SolrIndex.getInstance().addBatchData(metadata);

        if (queryString == null || queryString.trim().isEmpty()) {
            return true;
        }

        // build an IndexSearcher using the in-memory index
        Searcher searcher = new IndexSearcher(idx);
        // search directory
        isResponsive = search(searcher, queryString);

        searcher.close();
    } catch (Exception e) {
        // TODO handle this better
        // if anything happens - don't stop processing
        e.printStackTrace(System.out);
    } finally {
        try {
            if (writer != null) {
                writer.close();
            }
            if (idx != null) {
                idx.close();
            }
        } catch (Exception e) {
            // swallow exception, what else can you do now?
        }
    }
    return isResponsive;
}

From source file:com.silverwrist.dynamo.index.IndexServiceImpl.java

License:Mozilla Public License

public void addItem(String item_namespace, String item_name, Object item, String scope, java.util.Date date,
        DynamoUser owner, String text) throws IndexException {
    // Create a new Lucene Document containing the item information.
    Document doc = new Document();
    doc.add(Field.Keyword("id", createTag(item_namespace, item_name, item)));
    doc.add(Field.Keyword("date", date));
    doc.add(Field.Keyword("owner", owner.getName()));
    doc.add(Field.Keyword("scope", scope));
    doc.add(Field.UnStored("text", text));

    try { // Use an IndexWriter to write it to the index.
        IndexWriter iwr = new IndexWriter(m_directory, m_analyzer, false);
        iwr.addDocument(doc);
        iwr.close();

    } // end try
    catch (IOException e) { // translate Lucene's IOException here
        IndexException ie = new IndexException(IndexServiceImpl.class, "IndexMessages", "addItem.fail", e);
        ie.setParameter(0, item_namespace);
        ie.setParameter(1, item_name);
        ie.setParameter(2, m_identity.toString());
        throw ie;

    } // end catch

}

From source file:com.slieer.app.lecene3x.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 *             If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term
                // frequency or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e.
                // efficiently filterable with NumericRangeFilter). This
                // indexes to millisecond resolution, which is often too
                // fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the
                // resolution you require. For example the long value
                // 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is
                // tokenized and indexed, but not stored. Note that the
                // stream is decoded as UTF-8; if that is not the file's
                // actual encoding, searching for special characters will
                // fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.slieer.app.lecene3x.LuceneIndexAndSearchDemo.java

License:Apache License

/**
 * Lucene indexing and search demo using the IK Analyzer.
 * 
 * @param args
 */
public static void main(String[] args) {
    // Lucene Document field name
    String fieldName = "text";
    // sample texts to index
    String text = "IK Analyzer???????";
    String text1 = "? (Chinese Word Segmentation) ???????????";
    String text2 = "?????,,??,?";

    // construct the IKAnalyzer (true enables smart segmentation mode)
    Analyzer analyzer = new IKAnalyzer(true);

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // create an in-memory index directory
        directory = new RAMDirectory();

        // configure the IndexWriterConfig
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);
        // build and add the documents
        Document doc = new Document();
        //document.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
        Field strField = new StringField("ID", "10000", Field.Store.YES);
        Field textField = new StringField(fieldName, text, Field.Store.YES);
        //textField.setBoost(2);

        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        doc = new Document();
        strField = new StringField("ID", "10001", Field.Store.YES);
        textField = new StringField(fieldName, text1, Field.Store.YES);
        //strField.setBoost(1);
        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        doc = new Document();
        strField = new StringField("ID", "10002", Field.Store.YES);
        // textField = new TextField(fieldName, text2, Field.Store.YES);
        textField = new StringField(fieldName, text2, Field.Store.YES);
        //strField.setBoost(1);
        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        iwriter.close();

        // ********************** search **********************
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?";
        // use QueryParser to build the Query
        QueryParser qp = new QueryParser(Version.LUCENE_4_9, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // retrieve the top 5 matching documents
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("" + topDocs.totalHits);
        // 
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println("" + targetDoc.toString());
        }

    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:com.slieer.lucene.apachedemo.IndexFiles.java

License:Apache License

static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term
                // frequency or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e.
                // efficiently filterable with NumericRangeFilter). This
                // indexes to millisecond resolution, which is often too
                // fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the
                // resolution you require. For example the long value
                // 2011021714 would mean February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is
                // tokenized and indexed, but not stored. Note that the
                // stream is decoded as UTF-8; if that is not the file's
                // actual encoding, searching for special characters will
                // fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.soebes.supose.core.lucene.LuceneTest.java

License:Open Source License

@BeforeClass
public void beforeClass() throws CorruptIndexException, LockObtainFailedException, IOException {
    Analyzer analyzer = AnalyzerFactory.createInstance();

    // To store an index on disk, use this instead:
    // Directory directory = FSDirectory.getDirectory("/tmp/testindex");
    IndexWriter iwriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    iwriter.setMaxFieldLength(25000);

    Document doc = new Document();
    String text = "This is the text to be indexed.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(1));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/trunk/doc/testXML.doc");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is different text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(2));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/tags/docs/XYZabc.java");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is more different text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(3));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/tags/docs/SCMPlan.doc");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is the third text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(4));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/trunk/subdir/elviraXML.doc");
    iwriter.addDocument(doc);

    iwriter.optimize();
    iwriter.close();

    isearcher = new IndexSearcher(directory);
}

From source file:com.soebes.supose.core.scan.IndexMergeTest.java

License:Open Source License

public void testIndex1() throws Exception {
    Index index = new Index();
    IndexWriter indexWriter = index.createIndexWriter("index1");
    Document doc = new Document();
    addTokenizedField(doc, "revision", "1");
    addTokenizedField(doc, "revision", "2");
    indexWriter.addDocument(doc);
    indexWriter.close();
}

From source file:com.soebes.supose.core.scan.IndexMergeTest.java

License:Open Source License

public void testIndex2() throws Exception {
    Index index = new Index();
    IndexWriter indexWriter = index.createIndexWriter("index2");
    Document doc = new Document();
    addTokenizedField(doc, "revision", "3");
    addTokenizedField(doc, "revision", "4");
    indexWriter.addDocument(doc);
    indexWriter.close();
}

From source file:com.soebes.supose.core.scan.ScanRepository.java

License:Open Source License

/**
 * The method will index a particular document (file) into the Lucene index.
 * It will store the majority of the information about a file into the
 * Lucene index, such as revision, copyfrom, path, filename, etc.
 *
 * @param indexRevision
 * @param indexWriter
 * @param dirEntry
 * @param logEntry
 * @param entryPath
 * @throws SVNException
 * @throws IOException
 */
private void indexFile(RevisionDocument indexRevision, IndexWriter indexWriter, SVNDirEntry dirEntry,
        SVNLogEntry logEntry, SVNLogEntryPath entryPath) throws SVNException, IOException {
    SVNProperties fileProperties = new SVNProperties();

    SVNNodeKind nodeKind = null;
    // if the entry has been deleted we will check the information
    // about the entry via the revision before...
    LOGGER.debug("Before checking...");
    nodeKind = repository.getRepository().checkPath(entryPath.getPath(), logEntry.getRevision());
    LOGGER.debug("After checking...");

    indexRevision.addUnTokenizedField(FieldNames.REVISION, NumberUtils.pad(logEntry.getRevision()));

    boolean isDir = nodeKind == SVNNodeKind.DIR;
    boolean isFile = nodeKind == SVNNodeKind.FILE;
    FileName fileName = null;
    if (isDir) {
        LOGGER.debug("The " + entryPath.getPath() + " is a directory entry.");
        indexRevision.addUnTokenizedField(FieldNames.NODE, "dir");
        fileName = new FileName(entryPath.getPath(), true);

        if (getFiltering().ignorePath(fileName.getPath())) {
            // Ignore the path...
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("The following " + fileName.getPath()
                        + " is beeing ignored based on filtering (ignorePath()).");
            }
            return;
        }

    } else if (isFile) {
        LOGGER.debug("The " + entryPath.getPath() + " is a file entry.");
        indexRevision.addUnTokenizedField(FieldNames.NODE, "file");
        fileName = new FileName(entryPath.getPath(), false);

        if (getFiltering().ignoreFilename(fileName.getBaseName())) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("The following " + fileName.getBaseName()
                        + " is beeing ignored based on filtering (ignoreFilename()).");
            }
            // Ignore filename
            return;
        }

    } else {
        // This means a file/directory has been deleted.
        indexRevision.addUnTokenizedField(FieldNames.NODE, "unknown");
        LOGGER.debug("The " + entryPath.getPath() + " is an unknown entry.");

        // We would like to know what it has been.
        // Directory? File? So we go a step back in history...
        long rev = logEntry.getRevision() - 1;
        SVNNodeKind nodeKindUnknown = getRepository().getRepository().checkPath(entryPath.getPath(), rev);
        LOGGER.debug("NodeKind(" + rev + "): " + nodeKindUnknown.toString());
        fileName = new FileName(entryPath.getPath(), nodeKindUnknown == SVNNodeKind.DIR);
    }

    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug(
                "FileNameCheck: entryPath   -> kind:" + nodeKind.toString() + " path:" + entryPath.getPath());
        LOGGER.debug("FileNameCheck:                path:'" + fileName.getPath() + "' filename:'"
                + fileName.getBaseName() + "'");
    }

    // TODO: We have to check if we need to set localization
    indexRevision.addUnTokenizedFieldNoStore(FieldNames.PATH, fileName.getPath().toLowerCase());
    indexRevision.addUnTokenizedField(FieldNames.PATH, fileName.getPath());

    // Did a copy operation take place?
    if (entryPath.getCopyPath() != null) {
        indexRevision.addUnTokenizedField(FieldNames.FROM, entryPath.getCopyPath());
        indexRevision.addUnTokenizedField(FieldNames.FROMREV, entryPath.getCopyRevision());
    }

    // The field we use for searching is stored as lowercase.
    // TODO: We have to check if we need to set localization
    indexRevision.addUnTokenizedFieldNoStore(FieldNames.FILENAME, fileName.getBaseName().toLowerCase());
    indexRevision.addUnTokenizedField(FieldNames.FILENAME, fileName.getBaseName());

    indexRevision.addUnTokenizedField(FieldNames.AUTHOR,
            logEntry.getAuthor() == null ? "" : logEntry.getAuthor());

    // We will add the message as tokenized field to be able to search
    // within the log messages.
    indexRevision.addTokenizedField(FieldNames.MESSAGE,
            logEntry.getMessage() == null ? "" : logEntry.getMessage());
    indexRevision.addUnTokenizedField(FieldNames.DATE, logEntry.getDate());

    indexRevision.addUnTokenizedField(FieldNames.KIND, String.valueOf(entryPath.getType()).toLowerCase());

    // TODO: Maybe we don't need this if we use the repository name?
    indexRevision.addUnTokenizedField(FieldNames.REPOSITORYUUID,
            getRepository().getRepository().getRepositoryUUID(false));

    indexRevision.addUnTokenizedField(FieldNames.REPOSITORY, getName());

    if (nodeKind == SVNNodeKind.NONE) {
        LOGGER.debug("The " + entryPath.getPath() + " is a NONE entry.");
    } else if (nodeKind == SVNNodeKind.DIR) {
        // The given entry is a directory.
        LOGGER.debug("The " + entryPath.getPath() + " is a directory.");
        // Here we need to call getDir to get directory properties.
        Collection<SVNDirEntry> dirEntries = null;
        getRepository().getRepository().getDir(entryPath.getPath(), logEntry.getRevision(), fileProperties,
                dirEntries);
        indexProperties(fileProperties, indexRevision);

    } else if (nodeKind == SVNNodeKind.FILE) {

        // The given entry is a file.
        // This means we will get every file from the repository....
        // Get only the properties of the file

        indexRevision.addTokenizedField(FieldNames.SIZE, Long.toString(dirEntry.getSize()));
        getRepository().getRepository().getFile(entryPath.getPath(), logEntry.getRevision(), fileProperties,
                null);
        indexProperties(fileProperties, indexRevision);

        FileExtensionHandler feh = new FileExtensionHandler();
        feh.setFileProperties(fileProperties);
        feh.setDoc(indexRevision);
        feh.execute(getRepository(), dirEntry, entryPath.getPath(), logEntry.getRevision());
    }

    indexWriter.addDocument(indexRevision.getDoc());
    LOGGER.debug("File " + entryPath.getPath() + " indexed...");
}