List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
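Before the collected examples, here is a minimal, self-contained sketch of a typical addDocument call against the modern API (Lucene 6.2+, where addDocument returns a sequence number, matching the signature above). The index path and field names are illustrative only, not taken from any of the source files below:

    import java.io.IOException;
    import java.nio.file.Paths;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;

    public class AddDocumentExample {
        public static void main(String[] args) throws IOException {
            try (Directory dir = FSDirectory.open(Paths.get("example-index"));
                    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
                Document doc = new Document();
                doc.add(new TextField("title", "Hello Lucene", Field.Store.YES)); // illustrative field
                long seqNo = writer.addDocument(doc); // sequence number identifying this indexing operation
                writer.commit();
            }
        }
    }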
From source file:com.shaie.UTF8Indexing.java
License:Apache License
@SuppressWarnings("resource")
public static void main(String[] args) throws Exception {
    final Directory dir = new RAMDirectory();
    final StandardAnalyzer analyzer = new StandardAnalyzer();
    final IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(dir, conf);

    final Document doc = new Document();
    doc.add(new TextField("f", "Russia\u2013United States relations", Store.YES));
    writer.addDocument(doc);
    writer.close();

    final DirectoryReader reader = DirectoryReader.open(dir);
    final IndexSearcher searcher = new IndexSearcher(reader);
    final QueryParser qp = new QueryParser("f", analyzer);
    search(searcher, qp, "Russia United States relations");
    search(searcher, qp, "\"Russia United states relations\"");
    search(searcher, qp, "\"Russia-United states relations\"");
    search(searcher, qp, "\"Russia\u2013United states relations\"");
    reader.close();
    dir.close();
}
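The four searches above behave differently because StandardAnalyzer splits "Russia\u2013United" on the en dash (U+2013) and lowercases the result, so the indexed terms are russia, united, states, relations. A quick way to confirm the produced tokens (a sketch, not part of the original source file) is to run the analyzer directly:

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    try (StandardAnalyzer analyzer = new StandardAnalyzer();
            TokenStream ts = analyzer.tokenStream("f", "Russia\u2013United States relations")) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term); // prints: russia, united, states, relations
        }
        ts.end();
    }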
From source file:com.shmsoft.dmass.main.FileProcessor.java
License:Apache License
/**
 * Search metadata and file contents.
 *
 * @param metadata
 * @return true if a match is found, else false
 */
private boolean isResponsive(Metadata metadata) {
    // set true if search finds a match
    boolean isResponsive = false;
    // get culling parameters
    String queryString = Project.getProject().getCullingAsTextBlock();
    // TODO parse important parameters to mappers and reducers individually, not globally
    IndexWriter writer = null;
    RAMDirectory idx = null;
    try {
        // construct a RAMDirectory to hold the in-memory representation of the index
        idx = new RAMDirectory();
        // make a writer to create the index
        writer = new IndexWriter(idx, new StandardAnalyzer(Version.LUCENE_30), true,
                IndexWriter.MaxFieldLength.UNLIMITED);
        writer.addDocument(createDocument(metadata));
        // optimize and close the writer to finish building the index
        writer.optimize();
        writer.close();
        // add the built index to the filesystem
        if (Project.getProject().isLuceneFSIndexEnabled() && luceneIndex != null) {
            luceneIndex.addToIndex(idx);
        }
        SolrIndex.getInstance().addBatchData(metadata);
        if (queryString == null || queryString.trim().isEmpty()) {
            return true;
        }
        // build an IndexSearcher using the in-memory index
        Searcher searcher = new IndexSearcher(idx);
        // search the directory
        isResponsive = search(searcher, queryString);
        searcher.close();
    } catch (Exception e) {
        // TODO handle this better
        // if anything happens - don't stop processing
        e.printStackTrace(System.out);
    } finally {
        try {
            if (writer != null) {
                writer.close();
            }
            if (idx != null) {
                idx.close();
            }
        } catch (Exception e) {
            // swallow exception, what else can you do now?
        }
    }
    return isResponsive;
}
From source file:com.silverwrist.dynamo.index.IndexServiceImpl.java
License:Mozilla Public License
public void addItem(String item_namespace, String item_name, Object item, String scope,
        java.util.Date date, DynamoUser owner, String text) throws IndexException {
    // Create a new Lucene Document containing the item information.
    Document doc = new Document();
    doc.add(Field.Keyword("id", createTag(item_namespace, item_name, item)));
    doc.add(Field.Keyword("date", date));
    doc.add(Field.Keyword("owner", owner.getName()));
    doc.add(Field.Keyword("scope", scope));
    doc.add(Field.UnStored("text", text));
    try {
        // Use an IndexWriter to write it to the index.
        IndexWriter iwr = new IndexWriter(m_directory, m_analyzer, false);
        iwr.addDocument(doc);
        iwr.close();
    } // end try
    catch (IOException e) {
        // translate Lucene's IOException here
        IndexException ie = new IndexException(IndexServiceImpl.class, "IndexMessages", "addItem.fail", e);
        ie.setParameter(0, item_namespace);
        ie.setParameter(1, item_name);
        ie.setParameter(2, m_identity.toString());
        throw ie;
    } // end catch
}
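Field.Keyword and Field.UnStored are Lucene 1.x/2.x field constructors (indexed as a single untokenized term and stored, respectively tokenized but not stored). For orientation only, a rough Lucene 6+ equivalent of the document built above might look like the sketch below; the helper name is hypothetical, and the date handling (LongPoint plus StoredField) is an assumption, since Field.Keyword serialized the Date to a string:

    import java.util.Date;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.LongPoint;
    import org.apache.lucene.document.StoredField;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.document.TextField;

    // Hypothetical helper mirroring the fields of addItem() above.
    static Document buildItemDocument(String tag, Date date, String ownerName, String scope, String text) {
        Document doc = new Document();
        doc.add(new StringField("id", tag, Field.Store.YES));   // was Field.Keyword: one untokenized term, stored
        doc.add(new LongPoint("date", date.getTime()));         // indexed for range queries
        doc.add(new StoredField("date", date.getTime()));       // keeps the raw value retrievable
        doc.add(new StringField("owner", ownerName, Field.Store.YES));
        doc.add(new StringField("scope", scope, Field.Store.YES));
        doc.add(new TextField("text", text, Field.Store.NO));   // was Field.UnStored: tokenized, not stored
        return doc;
    }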
From source file:com.slieer.app.lecene3x.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An example
 * of this is in the benchmark module, which can create "line doc" files, one
 * document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer
 *            Writer to the index where the given file/dir info will be stored
 * @param file
 *            The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 *             If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception
                // with an "access denied" message; checking if the file can be
                // read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a field
                // that is indexed (i.e. searchable), but don't tokenize the field
                // into separate words and don't index term frequency or
                // positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e. efficiently
                // filterable with NumericRangeFilter). This indexes to
                // millisecond resolution, which is often too fine. You could
                // instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution
                // you require. For example, the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is tokenized
                // and indexed, but not stored. Note that FileReader expects the
                // file to be in UTF-8 encoding. If that's not the case,
                // searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed), so we use updateDocument instead to replace the
                    // old one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.slieer.app.lecene3x.LuceneIndexAndSearchDemo.java
License:Apache License
/**
 * IK Analyzer index-and-search demo. (The original Chinese comments and
 * sample strings were garbled in extraction; comments below translate the
 * recoverable intent, and the literal '?' runs are left as found.)
 *
 * @param args
 */
public static void main(String[] args) {
    // field name for the Lucene Document
    String fieldName = "text";
    // sample content; the original Chinese strings were garbled to '?'
    String text = "IK Analyzer???????";
    String text1 = "? (Chinese Word Segmentation) ???????????";
    String text2 = "?????,,??,?";

    // IKAnalyzer in smart-segmentation mode
    Analyzer analyzer = new IKAnalyzer(true);

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // build an in-memory index
        directory = new RAMDirectory();

        // configure the IndexWriter
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);

        // write the documents
        Document doc = new Document();
        // document.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
        Field strField = new StringField("ID", "10000", Field.Store.YES);
        Field textField = new StringField(fieldName, text, Field.Store.YES);
        // textField.setBoost(2);
        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        doc = new Document();
        strField = new StringField("ID", "10001", Field.Store.YES);
        textField = new StringField(fieldName, text1, Field.Store.YES);
        // strField.setBoost(1);
        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);

        doc = new Document();
        strField = new StringField("ID", "10002", Field.Store.YES);
        // textField = new TextField(fieldName, text2, Field.Store.YES);
        textField = new StringField(fieldName, text2, Field.Store.YES);
        // strField.setBoost(1);
        doc.add(strField);
        doc.add(textField);
        iwriter.addDocument(doc);
        iwriter.close();

        // search phase **********************************
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?"; // original Chinese keyword garbled in extraction

        // build a Query with QueryParser
        QueryParser qp = new QueryParser(Version.LUCENE_4_9, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // fetch the top 5 hits
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("hits: " + topDocs.totalHits);
        // iterate over the returned ScoreDocs, not totalHits, which may
        // exceed the number of documents actually returned
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println(targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.slieer.lucene.apachedemo.IndexFiles.java
License:Apache License
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception
                // with an "access denied" message; checking if the file can be
                // read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a field
                // that is indexed (i.e. searchable), but don't tokenize the field
                // into separate words and don't index term frequency or
                // positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named
                // "modified". Use a LongField that is indexed (i.e. efficiently
                // filterable with NumericRangeFilter). This indexes to
                // millisecond resolution, which is often too fine. You could
                // instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution
                // you require. For example, the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".
                // Specify a Reader, so that the text of the file is tokenized
                // and indexed, but not stored. Note that FileReader expects the
                // file to be in UTF-8 encoding. If that's not the case,
                // searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed), so we use updateDocument instead to replace the
                    // old one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.soebes.supose.core.lucene.LuceneTest.java
License:Open Source License
@BeforeClass
public void beforeClass() throws CorruptIndexException, LockObtainFailedException, IOException {
    Analyzer analyzer = AnalyzerFactory.createInstance();
    // To store an index on disk, use this instead:
    // Directory directory = FSDirectory.getDirectory("/tmp/testindex");
    IndexWriter iwriter = new IndexWriter(directory, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    iwriter.setMaxFieldLength(25000);

    Document doc = new Document();
    String text = "This is the text to be indexed.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(1));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/trunk/doc/testXML.doc");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is different text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(2));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/tags/docs/XYZabc.java");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is more different text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(3));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/tags/docs/SCMPlan.doc");
    iwriter.addDocument(doc);

    doc = new Document();
    text = "This is the third text.";
    addUnTokenizedField(doc, FieldNames.REVISION.getValue(), NumberUtils.pad(4));
    addTokenizedField(doc, FieldNames.CONTENTS.getValue(), text);
    addUnTokenizedField(doc, FieldNames.FILENAME.getValue(), "/trunk/subdir/elviraXML.doc");
    iwriter.addDocument(doc);

    iwriter.optimize();
    iwriter.close();
    isearcher = new IndexSearcher(directory);
}
From source file:com.soebes.supose.core.scan.IndexMergeTest.java
License:Open Source License
public void testIndex1() throws Exception {
    Index index = new Index();
    IndexWriter indexWriter = index.createIndexWriter("index1");
    Document doc = new Document();
    addTokenizedField(doc, "revision", "1");
    addTokenizedField(doc, "revision", "2");
    indexWriter.addDocument(doc);
    indexWriter.close();
}
From source file:com.soebes.supose.core.scan.IndexMergeTest.java
License:Open Source License
public void testIndex2() throws Exception {
    Index index = new Index();
    IndexWriter indexWriter = index.createIndexWriter("index2");
    Document doc = new Document();
    addTokenizedField(doc, "revision", "3");
    addTokenizedField(doc, "revision", "4");
    indexWriter.addDocument(doc);
    indexWriter.close();
}
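The two tests above only create the single-document indexes "index1" and "index2"; the merge the class name IndexMergeTest implies would typically go through IndexWriter.addIndexes. A sketch under the assumption of Lucene 3.1+ (where addIndexes(Directory...) exists) and FSDirectory-backed index directories; the "merged" path, Version constant, and analyzer choice are illustrative, not taken from the source:

    import java.io.File;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.FSDirectory;
    import org.apache.lucene.util.Version;

    // Merge the two single-document indexes created above into a third index.
    Directory target = FSDirectory.open(new File("merged"));
    IndexWriter merged = new IndexWriter(target, new StandardAnalyzer(Version.LUCENE_36), true,
            IndexWriter.MaxFieldLength.UNLIMITED);
    merged.addIndexes(FSDirectory.open(new File("index1")), FSDirectory.open(new File("index2")));
    merged.close();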
From source file:com.soebes.supose.core.scan.ScanRepository.java
License:Open Source License
/**
 * The method will index a particular document (file) into the Lucene index.
 * It will store the majority of the information about a file into the
 * Lucene index, like revision, copyfrom, path, filename etc.
 *
 * @param indexRevision
 * @param indexWriter
 * @param dirEntry
 * @param logEntry
 * @param entryPath
 * @throws SVNException
 * @throws IOException
 */
private void indexFile(RevisionDocument indexRevision, IndexWriter indexWriter, SVNDirEntry dirEntry,
        SVNLogEntry logEntry, SVNLogEntryPath entryPath) throws SVNException, IOException {
    SVNProperties fileProperties = new SVNProperties();
    SVNNodeKind nodeKind = null;

    // if the entry has been deleted, we will check the information about
    // the entry via the revision before...
    LOGGER.debug("Before checking...");
    nodeKind = repository.getRepository().checkPath(entryPath.getPath(), logEntry.getRevision());
    LOGGER.debug("After checking...");

    indexRevision.addUnTokenizedField(FieldNames.REVISION, NumberUtils.pad(logEntry.getRevision()));

    boolean isDir = nodeKind == SVNNodeKind.DIR;
    boolean isFile = nodeKind == SVNNodeKind.FILE;
    FileName fileName = null;
    if (isDir) {
        LOGGER.debug("The " + entryPath.getPath() + " is a directory entry.");
        indexRevision.addUnTokenizedField(FieldNames.NODE, "dir");
        fileName = new FileName(entryPath.getPath(), true);
        if (getFiltering().ignorePath(fileName.getPath())) {
            // Ignore the path...
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("The following " + fileName.getPath()
                        + " is being ignored based on filtering (ignorePath()).");
            }
            return;
        }
    } else if (isFile) {
        LOGGER.debug("The " + entryPath.getPath() + " is a file entry.");
        indexRevision.addUnTokenizedField(FieldNames.NODE, "file");
        fileName = new FileName(entryPath.getPath(), false);
        if (getFiltering().ignoreFilename(fileName.getBaseName())) {
            if (LOGGER.isDebugEnabled()) {
                LOGGER.debug("The following " + fileName.getBaseName()
                        + " is being ignored based on filtering (ignoreFilename()).");
            }
            // Ignore filename
            return;
        }
    } else {
        // This means a file/directory has been deleted.
        indexRevision.addUnTokenizedField(FieldNames.NODE, "unknown");
        LOGGER.debug("The " + entryPath.getPath() + " is an unknown entry.");
        // We would like to know what it has been: a directory? a file?
        // So we go a step back in history...
        long rev = logEntry.getRevision() - 1;
        SVNNodeKind nodeKindUnknown = getRepository().getRepository().checkPath(entryPath.getPath(), rev);
        LOGGER.debug("NodeKind(" + rev + "): " + nodeKindUnknown.toString());
        fileName = new FileName(entryPath.getPath(), nodeKindUnknown == SVNNodeKind.DIR);
    }

    if (LOGGER.isDebugEnabled()) {
        LOGGER.debug("FileNameCheck: entryPath -> kind:" + nodeKind.toString() + " path:" + entryPath.getPath());
        LOGGER.debug("FileNameCheck: path:'" + fileName.getPath() + "' filename:'" + fileName.getBaseName() + "'");
    }

    // TODO: We have to check if we need to set localization
    indexRevision.addUnTokenizedFieldNoStore(FieldNames.PATH, fileName.getPath().toLowerCase());
    indexRevision.addUnTokenizedField(FieldNames.PATH, fileName.getPath());

    // Did a copy operation take place?
    if (entryPath.getCopyPath() != null) {
        indexRevision.addUnTokenizedField(FieldNames.FROM, entryPath.getCopyPath());
        indexRevision.addUnTokenizedField(FieldNames.FROMREV, entryPath.getCopyRevision());
    }

    // The field we use for searching is stored as lowercase.
    // TODO: We have to check if we need to set localization
    indexRevision.addUnTokenizedFieldNoStore(FieldNames.FILENAME, fileName.getBaseName().toLowerCase());
    indexRevision.addUnTokenizedField(FieldNames.FILENAME, fileName.getBaseName());

    indexRevision.addUnTokenizedField(FieldNames.AUTHOR, logEntry.getAuthor() == null ? "" : logEntry.getAuthor());
    // We will add the message as a tokenized field to be able to search
    // within the log messages.
    indexRevision.addTokenizedField(FieldNames.MESSAGE, logEntry.getMessage() == null ? "" : logEntry.getMessage());
    indexRevision.addUnTokenizedField(FieldNames.DATE, logEntry.getDate());
    indexRevision.addUnTokenizedField(FieldNames.KIND, String.valueOf(entryPath.getType()).toLowerCase());
    // TODO: Maybe we don't need this if we use the repository name?
    indexRevision.addUnTokenizedField(FieldNames.REPOSITORYUUID,
            getRepository().getRepository().getRepositoryUUID(false));
    indexRevision.addUnTokenizedField(FieldNames.REPOSITORY, getName());

    if (nodeKind == SVNNodeKind.NONE) {
        LOGGER.debug("The " + entryPath.getPath() + " is a NONE entry.");
    } else if (nodeKind == SVNNodeKind.DIR) {
        // The given entry is a directory.
        LOGGER.debug("The " + entryPath.getPath() + " is a directory.");
        // Here we need to call getDir to get the directory properties.
        Collection<SVNDirEntry> dirEntries = null;
        getRepository().getRepository().getDir(entryPath.getPath(), logEntry.getRevision(), fileProperties,
                dirEntries);
        indexProperties(fileProperties, indexRevision);
    } else if (nodeKind == SVNNodeKind.FILE) {
        // The given entry is a file, which means we will get every file
        // from the repository. Get only the properties of the file.
        indexRevision.addTokenizedField(FieldNames.SIZE, Long.toString(dirEntry.getSize()));
        getRepository().getRepository().getFile(entryPath.getPath(), logEntry.getRevision(), fileProperties, null);
        indexProperties(fileProperties, indexRevision);
        FileExtensionHandler feh = new FileExtensionHandler();
        feh.setFileProperties(fileProperties);
        feh.setDoc(indexRevision);
        feh.execute(getRepository(), dirEntry, entryPath.getPath(), logEntry.getRevision());
    }

    indexWriter.addDocument(indexRevision.getDoc());
    LOGGER.debug("File " + entryPath.getPath() + " indexed...");
}