List of usage examples for the org.apache.lucene.document.TextField constructor TextField
public TextField(String name, TokenStream stream)
From source file:FileIndexer.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);// w ww. j a v a2 s.c om // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", lastModified, Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:DocIndexer.java
License:Apache License
private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException { RAMDirectory directory = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET)); config.setOpenMode(OpenMode.CREATE); config.setCommitOnClose(true);/*from w w w . j a v a2s . co m*/ try (IndexWriter iwriter = new IndexWriter(directory, config)) { for (String inputFile : inputFiles) { File file = new File(inputFile); if (file.length() == 0) { continue; } String title; try (BufferedReader titleReader = new BufferedReader( new InputStreamReader(new FileInputStream(file), "UTF-8"))) { title = titleReader.readLine(); if (title != null && title.startsWith("[[")) { // Generally the first line of the txt is the title. In a few cases the // first line is a "[[tag]]" and the second line is the title. title = titleReader.readLine(); } } Matcher matcher = SECTION_HEADER.matcher(title); if (matcher.matches()) { title = matcher.group(1); } String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt); try (FileReader reader = new FileReader(file)) { Document doc = new Document(); doc.add(new TextField(Constants.DOC_FIELD, reader)); doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES)); doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES)); iwriter.addDocument(doc); } } } return directory; }
From source file:back.Indexer.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /*from w ww.j a v a2 s . c o m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). 
This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:book.Indexer.java
License:Apache License
protected Document getDocument(File f) throws Exception { Document doc = new Document(); doc.add(new TextField("contents", new FileReader(f))); // was: doc.add(new Field("contents", new FileReader(f))); //7 doc.add(new StringField("filename", f.getName(), Field.Store.YES)); /* was: doc.add(new Field("filename", f.getName(), Field.Store.YES, Field.Index.NOT_ANALYZED));//8 */ doc.add(new StringField("fullpath", f.getCanonicalPath(), Field.Store.YES)); /* was: doc.add(new Field("fullpath", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED));// 9 */ return doc;//from w ww .j ava 2 s. c o m }
From source file:br.andrew.lucene.testing.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /*from w w w.j a v a 2 s. c o m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException If there is a low-level I/O error */ static void indexDocs(final IndexWriter writer, final File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { final String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { IndexFiles.indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (final FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document final Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: final Field pathField = new StringField("path", file.getPath(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. 
efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", file.lastModified(), Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:cn.fql.blogspider.IndexMain.java
License:Open Source License
/**
 * Indexes {@code file} with {@code writer}; when {@code file} is a directory,
 * every entry beneath it is indexed recursively. Unreadable entries are skipped.
 *
 * @param writer writer that receives the documents
 * @param file   file to index, or directory to walk
 * @throws IOException if reading a file or writing to the index fails
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    if (!file.canRead()) {
        return;
    }

    if (file.isDirectory()) {
        // list() may return null; in that case there is nothing to recurse into.
        String[] entries = file.list();
        if (entries == null) {
            return;
        }
        for (String entry : entries) {
            indexDocs(writer, new File(file, entry));
        }
        return;
    }

    FileInputStream fis;
    try {
        fis = new FileInputStream(file);
    } catch (FileNotFoundException fnfe) {
        // The file could not be opened: skip it without failing the run.
        return;
    }

    try {
        final String pathText = file.getPath();
        Document document = new Document();

        // "path" and "modified" are both stored; "contents" is only supplied
        // through a reader that decodes the file as UTF-8.
        document.add(new StringField("path", pathText, Field.Store.YES));
        document.add(new LongField("modified", file.lastModified(), Field.Store.YES));
        BufferedReader contents = new BufferedReader(new InputStreamReader(fis, "UTF-8"));
        document.add(new TextField("contents", contents));

        boolean freshIndex = writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE;
        if (freshIndex) {
            System.out.println("adding " + file);
            writer.addDocument(document);
        } else {
            // Replace any document already indexed under the same "path" term.
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", pathText), document);
        }
    } finally {
        fis.close();
    }
}
From source file:cn.larry.search.book.index.IndexFiles.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);/*from w ww . j av a2 s. c o m*/ // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 4 would mean // February 17, 1, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:com.aurel.track.lucene.index.associatedFields.AttachmentIndexer.java
License:Open Source License
/**
 * Builds the index document for an attachment whose content comes from a reader.
 *
 * @param attachmentID identifier of the attachment; no document is built when null
 * @param issueNo      number of the issue the attachment belongs to; no document is built when null
 * @param originalName value of the ORIGINALNAME field
 * @param realName     value of the REALNAME field
 * @param description  optional description; the DESCRIPTION field is added only when non-null
 * @param reader       source of the CONTENT field
 * @return the populated document, or {@code null} when {@code attachmentID} or
 *         {@code issueNo} is null or when building the document throws
 */
private Document createAttachmentDocument(Integer attachmentID, Integer issueNo, String originalName,
        String realName, String description, Reader reader) {
    // Without both identifiers there is nothing to index.
    if (attachmentID == null || issueNo == null) {
        return null;
    }

    final boolean hasReader = reader != null;
    try {
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("Creating the " + getLuceneFieldName()
                    + " document from reader by attachmentID " + attachmentID
                    + " issueNo " + issueNo
                    + " originalName " + originalName
                    + " realName " + realName
                    + " description " + description
                    + " reader not null " + hasReader);
        }

        Document result = new Document();
        // Content is only supplied through the reader; the identifying fields are stored.
        result.add(new TextField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.CONTENT, reader));
        result.add(new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.ATTACHMENTID,
                attachmentID.toString(), Field.Store.YES));
        result.add(new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.ISSUENO,
                issueNo.toString(), Field.Store.YES));
        result.add(new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.ORIGINALNAME,
                originalName, Field.Store.YES));
        result.add(new StringField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.REALNAME,
                realName, Field.Store.YES));
        if (description != null) {
            result.add(new TextField(LuceneUtil.ATTACHMENT_INDEX_FIELDS.DESCRIPTION,
                    description, Field.Store.NO));
        }
        return result;
    } catch (Exception e) {
        // Any failure is reported at error level (stack trace at debug level)
        // and turned into a null result rather than propagated.
        LOGGER.error("Creating the " + getLuceneFieldName()
                + " document from reader by attachmentID " + attachmentID
                + " issueNo " + issueNo
                + " originalName " + originalName
                + " realName " + realName
                + " description " + description
                + " reader not null " + hasReader
                + " failed with " + e.getMessage());
        LOGGER.debug(ExceptionUtils.getStackTrace(e));
        return null;
    }
}
From source file:com.chenyi.langeasy.lucene.IndexFiles.java
License:Apache License
/** Indexes a single document */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { count++;/* ww w . j a v a 2s. c o m*/ if (count % 500 == 499) { System.out.println(count + "/" + new Date()); } // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). This indexes to milli-second resolution, // which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you // require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongField("modified", lastModified, Field.Store.NO)); // Add the contents of the file to a field named "contents". Specify // a Reader, // so that the text of the file is tokenized and indexed, but not // stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will // fail. 
doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can // be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been // indexed) so // we use updateDocument instead to replace the old one matching // the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:com.codenvy.test.lucene.DeleteFilesWithSameName.java
License:Open Source License
/**
 * Indexes one file: its absolute path goes into the {@code PATH} field and its
 * UTF-8 decoded contents into the "contents" field.
 *
 * <p>When the writer is not in CREATE mode the document is written with
 * {@code updateDocument}, keyed on the same absolute path string that is
 * indexed in {@code PATH}.
 *
 * @param writer writer that receives the document
 * @param file   file to index
 * @throws IOException if the file cannot be opened or the writer reports an I/O error
 */
private static void indexDocs(IndexWriter writer, Path file) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // Compute the key once so the indexed value and the update term can never diverge.
        final String absolutePath = file.toAbsolutePath().toString();
        Document doc = new Document();
        System.out.println("file path " + absolutePath);

        Field pathField = new StringField(PATH, absolutePath, Field.Store.YES);
        doc.add(pathField);

        // Contents are supplied through a reader, decoded as UTF-8.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            System.out.println("updating " + file);
            // The term must equal the value indexed in PATH. Using file.toString()
            // here would not match a previously indexed document when a relative
            // path is passed, leaving the old copy in the index.
            writer.updateDocument(new Term(PATH, absolutePath), doc);
        }
    }
}