List of usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
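The examples below span several Lucene versions (3.x-era field flags like Field.Index.ANALYZED through the modern StringField/TextField API), but the call pattern is the same everywhere. Here is a minimal, self-contained sketch, assuming Lucene 6.2 or later (where addDocument returns a sequence number, matching the signature above); the index path and field names are illustrative placeholders, not taken from the examples:

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        // "demo-index" is a hypothetical path used only for this sketch.
        try (Directory dir = FSDirectory.open(Paths.get("demo-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            // Document implements Iterable<IndexableField>, which is why
            // addDocument accepts Iterable<? extends IndexableField>.
            Document doc = new Document();
            doc.add(new StringField("id", "42", Field.Store.YES));          // indexed as-is, not tokenized
            doc.add(new TextField("body", "hello lucene", Field.Store.NO)); // tokenized and indexed
            long seqNo = writer.addDocument(doc); // buffers the document; it is not yet searchable
            writer.commit(); // makes the change durable and visible to newly opened readers
        }
    }
}

Note that addDocument only buffers the document; a commit() (or close()) is needed before a newly opened reader sees it, as the benchmark example below demonstrates with its per-segment commits.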
From source file:com.google.gerrit.server.change.ReviewerSuggestionCache.java
License:Apache License
private void addAccount(IndexWriter writer, Account a) throws IOException, OrmException {
    Document doc = new Document();
    doc.add(new IntField(ID, a.getId().get(), Store.YES));
    if (a.getFullName() != null) {
        doc.add(new TextField(NAME, a.getFullName(), Store.YES));
    }
    if (a.getPreferredEmail() != null) {
        doc.add(new StringField(EMAIL, a.getPreferredEmail().toLowerCase(), Store.YES));
        doc.add(new TextField(EMAIL, a.getPreferredEmail(), Store.YES));
    }
    AccountExternalIdAccess extIdAccess = db.get().accountExternalIds();
    String username = AccountState.getUserName(extIdAccess.byAccount(a.getId()).toList());
    if (username != null) {
        doc.add(new StringField(USERNAME, username, Store.YES));
    }
    writer.addDocument(doc);
}
From source file:com.greplin.lucene.filter.PhraseFilterBenchmark.java
License:Apache License
public static void main(String[] argv) {
    Directory directory = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32)));
        int done = 0;
        for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) {
            int remaining = NUMBER_OF_SEGMENTS - i;
            int numberOfDocs;
            if (remaining == 1) {
                numberOfDocs = TOTAL_DOCS - done;
            } else {
                numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1;
            }
            done += numberOfDocs;
            System.out.println("Segment #" + i + " has " + numberOfDocs + " docs");
            for (int d = 0; d < numberOfDocs; d++) {
                int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2)
                        + AVERAGE_WORDS_PER_DOC - WORDS_PER_DOC_DEVIATION;
                Document doc = new Document();
                doc.add(new Field("f", Joiner.on(' ').join(words(wordCount)),
                        Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("second",
                        RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE ? "yes" : "no",
                        Field.Store.NO, Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
            writer.commit();
        }
        writer.close();

        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        String[][] queries = new String[TOTAL_QUERIES][];
        Term[][] terms = new Term[TOTAL_QUERIES][];
        for (int q = 0; q < TOTAL_QUERIES; q++) {
            queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]);
            terms[q] = new Term[queries[q].length];
            for (int qw = 0; qw < queries[q].length; qw++) {
                terms[q][qw] = new Term(FIELD, queries[q][qw]);
            }
        }

        // Warm up.
        new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader);

        for (int round = 0; round < ROUNDS; round++) {
            System.out.println();
            String name1 = "filter";
            String name2 = "query";
            long ms1 = 0, ms2 = 0;
            for (int step = 0; step < 2; step++) {
                System.gc();
                System.gc();
                System.gc();
                if (step == (round & 1)) {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (String[] queryWords : queries) {
                        PhraseFilter pf = new PhraseFilter(
                                new FilterIntersectionProvider(TermsFilter.from(new Term("second", "yes"))),
                                FIELD, queryWords);
                        hits += searcher.search(new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits;
                    }
                    ms1 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name1 + " in " + ms1 + "ms with " + hits + " hits");
                } else {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (Term[] queryTerms : terms) {
                        PhraseQuery pq = new PhraseQuery();
                        for (Term term : queryTerms) {
                            pq.add(term);
                        }
                        Query query = BooleanQueryBuilder.builder()
                                .must(new TermQuery(new Term("second", "yes"))).must(pq).build();
                        hits += searcher.search(query, 1).totalHits;
                    }
                    ms2 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name2 + " in " + ms2 + "ms with " + hits + " hits");
                }
            }
            System.out.println(name1 + " took " + (int) ((100.0 * ms1) / ms2) + "% as much time as " + name2);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.greplin.lucene.query.PredicateBonusQueryTest.java
License:Apache License
@Test
public void testBasics() throws Exception {
    IndexWriter writer = new IndexWriter(this.directory,
            new IndexWriterConfig(Version.LUCENE_35, new WhitespaceAnalyzer(Version.LUCENE_35)));
    writer.addDocument(new DocumentBuilder().add("value", "5").build());
    writer.close();

    IndexReader reader = IndexReader.open(this.directory);
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new ConstantScoreQuery(new TermQuery(new Term("value", "5")));
    Assert.assertEquals(1.0, searcher.search(query, 1).getMaxScore(), 0.00001);

    Query noBonus = new PredicateBonusQuery(query, Predicates.NONE, 10.0f);
    Assert.assertEquals(1.0, searcher.search(noBonus, 1).getMaxScore(), 0.00001);

    Query bonus = new PredicateBonusQuery(query, Predicates.ALL, 100.0f);
    Assert.assertEquals(101.0, searcher.search(bonus, 1).getMaxScore(), 0.00001);

    Query noMatch = new TermQuery(new Term("value", "not5"));
    Assert.assertEquals(Double.NaN, searcher.search(noMatch, 1).getMaxScore(), 0.00001);

    Query noMatchNoBonus = new PredicateBonusQuery(noMatch, Predicates.NONE, 10.0f);
    Assert.assertEquals(Double.NaN, searcher.search(noMatchNoBonus, 1).getMaxScore(), 0.00001);

    Query noMatchIgnoresBonus = new PredicateBonusQuery(noMatch, Predicates.ALL, 100.0f);
    Assert.assertEquals(Double.NaN, searcher.search(noMatchIgnoresBonus, 1).getMaxScore(), 0.00001);
}
From source file:com.heejong.lucene.IndexFiles.java
License:Apache License
/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path". Use a
        // field that is indexed (i.e. searchable), but don't tokenize
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery). This indexes to milli-second resolution, which
        // is often too fine. You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents". Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so
            // we use updateDocument instead to replace the old one matching the exact
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file:com.ibm.jaql.lang.expr.index.BuildLuceneFn.java
License:Apache License
@Override
public JsonValue eval(Context context) throws Exception {
    JsonRecord fd = (JsonRecord) exprs[1].eval(context);
    if (fd == null) {
        return null;
    }
    JsonString loc = (JsonString) fd.get(new JsonString("location"));
    if (loc == null) {
        return null;
    }
    Function keyFn = (Function) exprs[2].eval(context);
    if (keyFn == null) {
        return null;
    }
    Function valFn = (Function) exprs[3].eval(context);
    JsonIterator iter = exprs[0].iter(context);
    JsonValue[] fnArgs = new JsonValue[1];
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = new IndexWriter(loc.toString(), analyzer, true);
    ByteArrayOutputStream buf = null;
    DataOutputStream out = null;
    if (valFn != null) {
        buf = new ByteArrayOutputStream();
        out = new DataOutputStream(buf);
    }
    for (JsonValue value : iter) {
        fnArgs[0] = value;
        keyFn.setArguments(fnArgs);
        JsonIterator keyIter = keyFn.iter(context);
        Document doc = null;
        for (JsonValue key : keyIter) {
            JsonString jkey = (JsonString) key;
            if (doc == null) {
                doc = new Document();
            }
            doc.add(new Field("key", jkey.toString(), Store.NO, Index.UN_TOKENIZED));
            // TODO: typed keys, store binary value
        }
        if (doc != null) {
            if (valFn != null) {
                valFn.setArguments(fnArgs);
                JsonIterator valIter = valFn.iter(context);
                for (JsonValue val : valIter) {
                    JsonRecord jrec = (JsonRecord) val;
                    for (Entry<JsonString, JsonValue> e : jrec) {
                        JsonString name = e.getKey();
                        JsonValue fval = e.getValue();
                        buf.reset();
                        serializer.write(out, fval);
                        out.flush();
                        byte[] bytes = buf.toByteArray();
                        doc.add(new Field(name.toString(), bytes, Store.COMPRESS));
                    }
                }
            }
            writer.addDocument(doc);
        }
    }
    writer.optimize();
    writer.close();
    return fd;
}
From source file:com.ibm.watson.developer_cloud.professor_languo.ingestion.indexing.LuceneIndexer.java
License:Open Source License
/**
 * Given a built corpus (a set of StackExchangeThreads without duplicates), an index writer and a
 * document mapper, write the index file with documents and record the statistics during the
 * indexing period.
 *
 * @param uniqueThreadDirPath - the path of the folder which stores the unique threads
 * @param writer - an index writer which can write document units to the index file
 * @param docMapper - document mapper which maps the StackExchange instance to the document unit
 * @return the statistics during the indexing period.
 * @throws IngestionException
 */
private IndexingStats indexCorpus(String uniqueThreadDirPath, IndexWriter writer, DocumentMapper docMapper)
        throws IngestionException {
    List<Integer> indexThreadIds = new ArrayList<Integer>();
    long startTime, endTime;
    int indexDocNum;
    StackExchangeThread thread = null;
    File[] serFiles = new File(uniqueThreadDirPath).listFiles();
    try {
        startTime = System.currentTimeMillis();
        // restore the unique StackExchangeThreads from the .ser files and index them
        for (File serFile : serFiles) {
            thread = StackExchangeThreadSerializer.deserializeThreadFromBinFile(serFile.getPath());
            Document doc = docMapper.createDocument(thread);
            writer.addDocument(doc);
            indexThreadIds.add(thread.getId());
        }
        endTime = System.currentTimeMillis();
        indexDocNum = writer.numDocs();
        closeIndexWriter();
    } catch (IOException e) {
        throw new IngestionException(e);
    }
    return createIndexingStats(indexDocNum, indexThreadIds, endTime - startTime);
}
From source file:com.icdd.lucence.IndexFiles.java
License:Apache License
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);
        doc.add(new SortedNumericDocValuesField("modified", lastModified));
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed), so
            // we use updateDocument to replace the old one matching the exact path,
            // if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
From source file:com.icdd.lucene.CreateIndex.java
License:Apache License
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    // filter non-xml files
    if (filter.accept(file.toFile())) {
        System.out.println("num: " + num);
        num++;
        if (num < endset && num >= offset) {
            try (InputStream stream = Files.newInputStream(file)) {
                // make a new, empty document
                Document doc = new Document();
                Field pathField = new StringField("path", file.toString(), Field.Store.YES);
                String filename = file.getFileName().toString();
                int post = filename.indexOf('_');
                if (post > 0) {
                    filename = filename.substring(post + 1, filename.length() - 4);
                }
                doc.add(pathField);
                doc.add(new StringField("title", filename, Field.Store.YES));
                doc.add(new SortedNumericDocValuesField("modified", lastModified));
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))));
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    logger.info("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed), so
                    // we use updateDocument to replace the old one matching the exact path,
                    // if present:
                    logger.info("updating " + file);
                    writer.updateDocument(new Term("path", file.toString()), doc);
                }
            }
        }
    }
}
From source file:com.ikon.analysis.SearchDemo.java
License:Open Source License
/**
 * Add documents
 */
private static void add(Directory index, Analyzer analyzer, String str) throws IOException, ParseException {
    IndexWriterConfig config = new IndexWriterConfig(Config.LUCENE_VERSION, analyzer);
    IndexWriter w = new IndexWriter(index, config);
    Document doc = new Document();
    doc.add(new Field(DOC_FIELD, str, Field.Store.YES, Field.Index.ANALYZED));
    w.addDocument(doc);
    w.close();
}
From source file:com.impetus.kundera.index.LuceneIndexer.java
License:Apache License
/**
 * Indexes document in file system using Lucene.
 *
 * @param metadata
 *            the metadata
 * @param document
 *            the document
 */
public void indexDocument(EntityMetadata metadata, Document document) {
    if (log.isDebugEnabled()) {
        log.debug("Indexing document: {} in file system using Lucene", document);
    }
    IndexWriter w = getIndexWriter();
    try {
        w.addDocument(document);
    } catch (Exception e) {
        log.error("Error while indexing document {} into Lucene, Caused by:{} ", document, e);
        throw new LuceneIndexingException("Error while indexing document " + document + " into Lucene.", e);
    }
}