List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
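A minimal, self-contained sketch of this call, assuming a Lucene 6.x-era API (where addDocument returns a sequence number); the field names and sample text are illustrative, not taken from the examples below:

import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        Directory dir = new RAMDirectory(); // in-memory index, for illustration only
        try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new StringField("id", "doc-1", Store.YES));        // indexed as one exact token
            doc.add(new TextField("body", "hello lucene", Store.YES)); // analyzed full text
            long seqNo = writer.addDocument(doc); // sequence number identifying this change
        }
    }
}

The examples below come from real projects and span several Lucene versions, so the Document, Field, and IndexWriter construction idioms vary.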
From source file:com.tamingtext.frankenstein.Frankenstein.java
License:Apache License
/**
 * Index the content of Frankenstein, one paragraph per Lucene document.
 *
 * @throws IOException
 */
private void index() throws IOException {
    System.out.println("Indexing Frankenstein");
    InputStream stream = getClass().getClassLoader().getResourceAsStream("frankenstein-gutenberg.txt");
    BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
    // let's index a paragraph at a time
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
    directory = new RAMDirectory();
    IndexWriter iw = new IndexWriter(directory, conf);
    String line;
    StringBuilder paraBuffer = new StringBuilder(2048);
    int lines = 0;
    int paragraphs = 0;
    int paragraphLines = 0;
    while ((line = reader.readLine()) != null) {
        if (line.contains("End of the Project Gutenberg")) {
            // we are in the license section at the end of the book
            break;
        }
        if (line.startsWith("#")) { // skip comments
            continue;
        }
        // if the line is blank, we have a complete paragraph, so let's index it
        if (line.matches("^\\s*$") && paraBuffer.length() > 0) {
            Document doc = new Document();
            // we can retrieve by paragraph number if we want
            String theString = paraBuffer.toString().trim();
            if (theString.length() > 0 && theString.matches("^\\s*$") == false) {
                addMetadata(doc, lines, paragraphs, paragraphLines);
                // add the main content
                doc.add(new Field("paragraph", theString, Field.Store.YES, Field.Index.ANALYZED));
                // index the document
                iw.addDocument(doc);
                paragraphs++;
            }
            // reset some of our state
            paraBuffer.setLength(0); // we are done with this paragraph
            paragraphLines = 0;
        } else {
            paraBuffer.append(line).append(' ');
        }
        lines++;
        paragraphLines++;
    }
    System.out.println("Processed " + lines + " lines. Paragraphs: " + paragraphs);
    iw.close();
}
From source file:com.tamingtext.fuzzy.OverlapMeasures.java
License:Apache License
public TopDocs cosine(String queryTerm, int n, String... terms) throws IOException, ParseException {
    Directory directory = new RAMDirectory();
    final Pattern pattern = Pattern.compile(".");
    // tokenizes each document into individual characters
    Analyzer analyzer = new Analyzer() {
        @Override
        public TokenStream tokenStream(String fieldName, Reader reader) {
            TokenStream result = null;
            try {
                result = new PatternTokenizer(reader, pattern, 0);
            } catch (IOException e) {
                // swallowed: fall through and return null
            }
            return result;
        }
    };
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    for (String term : terms) {
        Document doc = new Document();
        doc.add(new Field("chars", term, Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
    }
    writer.close();
    IndexReader reader = IndexReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), terms.length);
    for (int i = 0; i < topDocs.scoreDocs.length; i++) {
        System.out.println("Id: " + topDocs.scoreDocs[i].doc + " Val: "
                + searcher.doc(topDocs.scoreDocs[i].doc).get("chars"));
    }
    QueryParser qp = new QueryParser(Version.LUCENE_36, "chars", analyzer);
    Query query = qp.parse(queryTerm);
    return searcher.search(query, n);
}
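A hypothetical invocation of the method above; the query and terms are illustrative, and it assumes OverlapMeasures has a no-argument constructor:

OverlapMeasures om = new OverlapMeasures();
// index "taming", "text", and "tamingtext" character-by-character, then rank them against the query
TopDocs top = om.cosine("tamingtext", 2, "taming", "text", "tamingtext");
System.out.println("hits: " + top.totalHits);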
From source file:com.taobao.common.tedis.support.lucene.analysis.xanalyzer.TestHighLight.java
License:Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    Directory ramDir = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(ramDir, /* new StandardAnalyzer() */ XFactory.getWriterAnalyzer());
        Document doc = new Document();
        Field fd = new Field(FIELD_NAME, CONTENT, Field.Store.YES, Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS);
        doc.add(fd);
        writer.addDocument(doc);
        writer.optimize();
        writer.close();

        IndexReader reader = IndexReader.open(ramDir);
        String queryString = QUERY;
        QueryParser parser = new QueryParser(FIELD_NAME, /* new StandardAnalyzer() */ XFactory.getWriterAnalyzer());
        Query query = parser.parse(queryString);
        System.out.println(query);
        Searcher searcher = new IndexSearcher(ramDir);
        query = query.rewrite(reader);
        System.out.println(query);
        System.out.println("Searching for: " + query.toString(FIELD_NAME));
        Hits hits = searcher.search(query);

        BoldFormatter formatter = new BoldFormatter();
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(50));
        for (int i = 0; i < hits.length(); i++) {
            String text = hits.doc(i).get(FIELD_NAME);
            int maxNumFragmentsRequired = 5;
            String fragmentSeparator = "...";
            TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(hits.id(i), FIELD_NAME);
            TokenStream tokenStream = TokenSources.getTokenStream(tpv);
            /*
             * TokenStream tokenStream2 = (new StandardAnalyzer()) // XFactory.getWriterAnalyzer()
             *         .tokenStream(FIELD_NAME, new StringReader(text));
             * do {
             *     Token t = tokenStream2.next();
             *     if (t == null) break;
             *     System.out.println("\t" + t.startOffset() + "," + t.endOffset() + "\t" + t.termText());
             * } while (true);
             */
            String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
                    fragmentSeparator);
            System.out.println("\n" + result);
        }
        reader.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:com.tekstosense.stemmer.index.Indexer.java
License:Open Source License
/**
 * Adds a single document with a title and an ISBN to the index.
 *
 * @param w     the index writer
 * @param title the title
 * @param isbn  the isbn
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static void addDoc(IndexWriter w, String title, String isbn) throws IOException {
    Document doc = new Document();
    doc.add(new TextField("title", title, Store.YES));   // analyzed: searchable by individual words
    doc.add(new StringField("isbn", isbn, Store.YES));   // not analyzed: indexed as a single exact token
    w.addDocument(doc);
}
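A hypothetical caller for the helper above, assuming a Lucene 5.x+ setup; the index path, titles, and ISBNs are made up:

try (Directory dir = FSDirectory.open(Paths.get("/tmp/books-index"));
     IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
    addDoc(w, "Lucene in Action", "978-1933988177");
    addDoc(w, "Taming Text", "978-1933988382");
}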
From source file:com.test.LuceneDemo.java
License:Apache License
@Test
public void test() throws IOException, org.apache.lucene.queryparser.classic.ParseException {
    Analyzer analyzer = new StandardAnalyzer();

    // Store the index in memory:
    Directory directory = new RAMDirectory();
    // To store an index on disk, use this instead:
    // Directory directory = FSDirectory.open(Paths.get("/tmp/testindex"));
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter iwriter = new IndexWriter(directory, config);
    Document doc = new Document();
    String text = "This is the text to be indexed.";
    doc.add(new Field("fieldname", text, TextField.TYPE_STORED));
    iwriter.addDocument(doc);
    iwriter.close();

    // Now search the index:
    DirectoryReader ireader = DirectoryReader.open(directory);
    IndexSearcher isearcher = new IndexSearcher(ireader);
    // Parse a simple query that searches for "text":
    QueryParser parser = new QueryParser("fieldname", analyzer);
    Query query = parser.parse("indexed");
    ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs;
    assertEquals(1, hits.length);
    // Iterate through the results:
    for (int i = 0; i < hits.length; i++) {
        Document hitDoc = isearcher.doc(hits[i].doc);
        assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
    }
    ireader.close();
    directory.close();
}
From source file:com.tistory.devyongsik.demo.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file   The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setOmitTermFreqAndPositions(true);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.vmware.dcp.services.common.LuceneBlobIndexService.java
License:Open Source License
protected void handlePost(Operation post) {
    if (post.isRemote()) {
        post.fail(new IllegalStateException("Remote requests not allowed"));
        return;
    }
    Map<String, String> params = UriUtils.parseUriQueryParams(post.getUri());
    String key = params.get(URI_PARAM_NAME_KEY);
    if (key == null) {
        post.fail(new IllegalArgumentException("key query parameter is required"));
        return;
    }
    String updateTimeParam = params.get(URI_PARAM_NAME_UPDATE_TIME);
    if (updateTimeParam == null) {
        post.fail(new IllegalArgumentException("update time query parameter is required"));
        return;
    }
    long updateTime = Long.parseLong(updateTimeParam);
    IndexWriter wr = this.writer;
    if (wr == null) {
        post.fail(new CancellationException());
        return;
    }
    try {
        Object content = post.getBodyRaw();
        if (content == null) {
            post.fail(new IllegalArgumentException("service instance is required"));
            return;
        }
        byte[] binaryContent = new byte[this.maxBinaryContextSizeBytes];
        int count = Utils.toBytes(content, binaryContent, 0);
        Document doc = new Document();
        Field binaryContentField = new StoredField(LUCENE_FIELD_NAME_BINARY_CONTENT, binaryContent, 0, count);
        doc.add(binaryContentField);
        Field keyField = new StringField(URI_PARAM_NAME_KEY, key, Field.Store.NO);
        doc.add(keyField);
        Field updateTimeField = new LongField(URI_PARAM_NAME_UPDATE_TIME, updateTime, this.longStoredField);
        doc.add(updateTimeField);
        wr.addDocument(doc);
        this.indexUpdateTimeMicros = Utils.getNowMicrosUtc();
        post.setBody(null).complete();
    } catch (Throwable e) {
        logSevere(e);
        post.fail(e);
    }
}
From source file:com.vmware.dcp.services.common.LuceneDocumentIndexService.java
License:Open Source License
private void addDocumentToIndex(Operation op, Document doc, ServiceDocument sd, ServiceDocumentDescription desc)
        throws IOException {
    IndexWriter wr = this.writer;
    if (wr == null) {
        op.fail(new CancellationException());
        return;
    }
    long start = Utils.getNowMicrosUtc();
    wr.addDocument(doc);
    updateSelfLinkInfo(sd);
    long end = Utils.getNowMicrosUtc();
    if (hasOption(ServiceOption.INSTRUMENTATION)) {
        ServiceStat s = getHistogramStat(STAT_NAME_INDEXING_DURATION_MICROS);
        setStat(s, end - start);
    }
    op.setBody(null).complete();
    checkDocumentRetentionLimit(sd, desc);
    applyActiveQueries(sd, desc);
}
From source file:com.vmware.xenon.services.common.LuceneBlobIndexService.java
License:Open Source License
public void handlePost(Operation post) {
    if (post.isRemote()) {
        post.fail(new IllegalStateException("Remote requests not allowed"));
        return;
    }
    Map<String, String> params = UriUtils.parseUriQueryParams(post.getUri());
    String key = params.get(URI_PARAM_NAME_KEY);
    if (key == null) {
        post.fail(new IllegalArgumentException("key query parameter is required"));
        return;
    }
    String updateTimeParam = params.get(URI_PARAM_NAME_UPDATE_TIME);
    if (updateTimeParam == null) {
        post.fail(new IllegalArgumentException("update time query parameter is required"));
        return;
    }
    long updateTime = Long.parseLong(updateTimeParam);
    IndexWriter wr = this.writer;
    if (wr == null) {
        post.fail(new CancellationException());
        return;
    }
    try {
        Object content = post.getBodyRaw();
        if (content == null) {
            post.fail(new IllegalArgumentException("service instance is required"));
            return;
        }
        byte[] binaryContent = getBuffer();
        int count = Utils.toBytes(content, binaryContent, 0);
        Document doc = new Document();
        Field binaryContentField = new StoredField(LUCENE_FIELD_NAME_BINARY_CONTENT, binaryContent, 0, count);
        doc.add(binaryContentField);
        Field keyField = new StringField(URI_PARAM_NAME_KEY, key, Field.Store.NO);
        doc.add(keyField);
        LuceneDocumentIndexService.addNumericField(doc, URI_PARAM_NAME_UPDATE_TIME, updateTime, true);
        wr.addDocument(doc);
        this.indexUpdateTimeMicros = Utils.getNowMicrosUtc();
        post.setBody(null).complete();
    } catch (Throwable e) {
        logSevere(e);
        post.fail(e);
    }
}
From source file:com.vmware.xenon.services.common.LuceneDocumentIndexService.java
License:Open Source License
private void addDocumentToIndex(Operation op, Document doc, ServiceDocument sd, ServiceDocumentDescription desc)
        throws IOException {
    IndexWriter wr = this.writer;
    if (wr == null) {
        op.fail(new CancellationException());
        return;
    }
    long start = Utils.getNowMicrosUtc();
    wr.addDocument(doc);
    long end = Utils.getNowMicrosUtc();
    // Use the time AFTER the index was updated to be sure that it can be compared
    // against the time the searcher was updated and have this change
    // be reflected in the new searcher. If the start time were used,
    // it would be possible to race with updating the searcher and NOT have this
    // change be reflected in the searcher.
    updateLinkAccessTime(end, sd.documentSelfLink);
    if (hasOption(ServiceOption.INSTRUMENTATION)) {
        ServiceStat s = getHistogramStat(STAT_NAME_INDEXING_DURATION_MICROS);
        setStat(s, end - start);
    }
    op.setBody(null).complete();
    checkDocumentRetentionLimit(sd, desc);
    applyActiveQueries(sd, desc);
}