Example usage for org.apache.lucene.document TextField TextField

List of usage examples for org.apache.lucene.document TextField TextField

Introduction

In this page you can find the example usage for org.apache.lucene.document TextField TextField.

Prototype

public TextField(String name, TokenStream stream) 

Source Link

Document

Creates a new un-stored TextField with TokenStream value.

Usage

From source file:FileIndexer.java

License:Apache License

/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);//  w  ww. j a v  a2 s.c om

        // Add the last modified date of the file a field named "modified".
        // Use a LongField that is indexed (i.e. efficiently filterable with
        // NumericRangeFilter).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:DocIndexer.java

License:Apache License

private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException {
    RAMDirectory directory = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET));
    config.setOpenMode(OpenMode.CREATE);
    config.setCommitOnClose(true);/*from   w  w  w  . j  a v  a2s . co m*/
    try (IndexWriter iwriter = new IndexWriter(directory, config)) {
        for (String inputFile : inputFiles) {
            File file = new File(inputFile);
            if (file.length() == 0) {
                continue;
            }

            String title;
            try (BufferedReader titleReader = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
                title = titleReader.readLine();
                if (title != null && title.startsWith("[[")) {
                    // Generally the first line of the txt is the title. In a few cases the
                    // first line is a "[[tag]]" and the second line is the title.
                    title = titleReader.readLine();
                }
            }
            Matcher matcher = SECTION_HEADER.matcher(title);
            if (matcher.matches()) {
                title = matcher.group(1);
            }

            String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt);
            try (FileReader reader = new FileReader(file)) {
                Document doc = new Document();
                doc.add(new TextField(Constants.DOC_FIELD, reader));
                doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES));
                doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES));
                iwriter.addDocument(doc);
            }
        }
    }
    return directory;
}

From source file:back.Indexer.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * /*from w  ww.j  a v a2  s  . c o  m*/
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:book.Indexer.java

License:Apache License

protected Document getDocument(File f) throws Exception {

    Document doc = new Document();
    doc.add(new TextField("contents", new FileReader(f)));
    // was: doc.add(new Field("contents", new FileReader(f))); //7
    doc.add(new StringField("filename", f.getName(), Field.Store.YES));
    /* was: doc.add(new Field("filename", f.getName(), Field.Store.YES,
       Field.Index.NOT_ANALYZED));//8 */
    doc.add(new StringField("fullpath", f.getCanonicalPath(), Field.Store.YES));
    /* was: doc.add(new Field("fullpath", f.getCanonicalPath(),
       Field.Store.YES, Field.Index.NOT_ANALYZED));// 9 */
    return doc;//from   w ww  .j  ava  2 s. c  o  m
}

From source file:br.andrew.lucene.testing.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * /*from   w w w.j  a  v a  2  s.  c  o m*/
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(final IndexWriter writer, final File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            final String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    IndexFiles.indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (final FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                final Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                final Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:cn.fql.blogspider.IndexMain.java

License:Open Source License

static void indexDocs(IndexWriter writer, File file) throws IOException {
    if (file.canRead())
        if (file.isDirectory()) {
            String[] files = file.list();

            if (files != null)
                for (int i = 0; i < files.length; ++i)
                    indexDocs(writer, new File(file, files[i]));
        } else {/* w ww. ja va 2s .  com*/
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                return;
            }

            try {
                Document doc = new Document();

                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));

                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
}

From source file:cn.larry.search.book.index.IndexFiles.java

License:Apache License

/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);/*from   w  ww .  j  av a2 s. c o m*/

        // Add the last modified date of the file a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you require.
        // For example the long value 4 would mean
        // February 17, 1, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:com.aurel.track.lucene.index.associatedFields.AttachmentIndexer.java

License:Open Source License

/**
 * Returns an attachment document based on reader content.
 * @param issueNo//from  w w w.  j a v a 2 s.  c om
 * @param originalName
 * @param realName
 * @param reader
 * @return
 */
private Document createAttachmentDocument(Integer attachmentID, Integer issueNo, String originalName,
        String realName, String description, Reader reader) {
    try {
        if (attachmentID != null && issueNo != null) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("Creating the " + getLuceneFieldName() + " document from reader by attachmentID "
                        + attachmentID + " issueNo " + issueNo + " originalName " + originalName + " realName "
                        + realName + " description " + description + " reader not null "
                        + Boolean.valueOf(reader != null).toString());
            }
            Document document = new Document();
            document.add(new TextField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.CONTENT, reader));
            document.add(new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.ATTACHMENTID,
                    attachmentID.toString(), Field.Store.YES));
            document.add(new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.ISSUENO, issueNo.toString(),
                    Field.Store.YES));
            document.add(new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.ORIGINALNAME, originalName,
                    Field.Store.YES));
            document.add(
                    new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.REALNAME, realName, Field.Store.YES));
            if (description != null) {
                document.add(new TextField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.DESCRIPTION, description,
                        Field.Store.NO));
            }
            return document;
        }
    } catch (Exception e) {
        LOGGER.error("Creating the " + getLuceneFieldName() + " document from reader by attachmentID "
                + attachmentID + " issueNo " + issueNo + " originalName " + originalName + " realName "
                + realName + " description " + description + " reader not null "
                + Boolean.valueOf(reader != null).toString() + " failed with " + e.getMessage());
        LOGGER.debug(ExceptionUtils.getStackTrace(e));
    }
    return null;
}

From source file:com.chenyi.langeasy.lucene.IndexFiles.java

License:Apache License

/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        count++;/* ww w  . j a  v  a 2s. c  o  m*/
        if (count % 500 == 499) {
            System.out.println(count + "/" + new Date());
        }
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file a field named "modified".
        // Use a LongField that is indexed (i.e. efficiently filterable with
        // NumericRangeFilter). This indexes to milli-second resolution,
        // which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you
        // require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // Add the contents of the file to a field named "contents". Specify
        // a Reader,
        // so that the text of the file is tokenized and indexed, but not
        // stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will
        // fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can
            // be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been
            // indexed) so
            // we use updateDocument instead to replace the old one matching
            // the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:com.codenvy.test.lucene.DeleteFilesWithSameName.java

License:Open Source License

private static void indexDocs(IndexWriter writer, Path file) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        Document doc = new Document();

        System.out.println("file path " + file.toAbsolutePath().toString());
        Field pathField = new StringField(PATH, file.toAbsolutePath().toString(), Field.Store.YES);
        doc.add(pathField);/*from  ww  w .j  ava 2s  .c  o m*/

        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {

            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            System.out.println("updating " + file);
            writer.updateDocument(new Term(PATH, file.toString()), doc);
        }
    }
}