List of usage examples for org.apache.lucene.index IndexWriter addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
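Before the collected examples, a minimal self-contained sketch of the call. The directory path and field names here are illustrative, not taken from any of the projects below; the long return value (an operation sequence number) exists in Lucene 6 and later, while earlier versions return void.

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentExample {
    public static void main(String[] args) throws IOException {
        // Open (or create) an index in a local directory; the path is illustrative.
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            // StringField: indexed as a single token, not analyzed (good for IDs).
            doc.add(new StringField("id", "1", Field.Store.YES));
            // TextField: analyzed full text.
            doc.add(new TextField("body", "hello lucene", Field.Store.YES));
            writer.addDocument(doc);
            writer.commit();
        }
    }
}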
From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java
License:Open Source License
private void addDoc(IndexWriter w, String title, String number) {
    Document doc = new Document();
    doc.add(new TextField("title", title, Field.Store.YES));
    doc.add(new StringField("number", number, Field.Store.YES));
    try {
        w.addDocument(doc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:com.github.msarhan.lucene.ArabicRootExtractorAnalyzerTests.java
License:Open Source License
@Test
public void testInlineStemmer() throws IOException, ParseException {
    // Initialize the index
    Directory index = new RAMDirectory();
    Analyzer analyzer = new ArabicRootExtractorAnalyzer();
    IndexWriterConfig config = new IndexWriterConfig(analyzer);
    IndexWriter writer = new IndexWriter(index, config);

    // NOTE: the original Arabic string literals (titles and query) were lost to
    // mis-encoding; the "?" placeholders below stand in for them.
    Document doc = new Document();
    doc.add(new StringField("number", "1", Field.Store.YES));
    doc.add(new TextField("title", "?? ? ? ??", Field.Store.YES));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("number", "2", Field.Store.YES));
    doc.add(new TextField("title", "? ?? ? ?", Field.Store.YES));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new StringField("number", "3", Field.Store.YES));
    doc.add(new TextField("title", "? ??", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    //~

    // Query the index
    String queryStr = "";
    Query query = new QueryParser("title", analyzer).parse(queryStr);
    int hitsPerPage = 5;
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs docs = searcher.search(query, hitsPerPage, Sort.INDEXORDER);
    ScoreDoc[] hits = docs.scoreDocs;
    //~

    // Print results
    /*
    System.out.println("Found " + hits.length + " hits:");
    for (ScoreDoc hit : hits) {
        int docId = hit.doc;
        Document d = searcher.doc(docId);
        System.out.printf("\t(%s): %s\n", d.get("number"), d.get("title"));
    }
    */
    //~
}
From source file:com.github.tenorviol.gitsearch.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // TODO: make these exclusions configurable
    String fileName = file.getName();
    if (fileName.charAt(0) == '.') {
        return;
    }
    int dotLoc = fileName.lastIndexOf('.');
    String extension = fileName.substring(dotLoc + 1);
    // known binary extensions
    if (extension.equals("jpg") || extension.equals("png") || extension.equals("gif")
            || extension.equals("pdf") || extension.equals("fla") || extension.equals("flv")
            || extension.equals("swf") || extension.equals("swz")) {
        return;
    }
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new Field("path", file.getPath(), Field.Store.YES,
                        Field.Index.NOT_ANALYZED_NO_NORMS);
                pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a NumericField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                NumericField modifiedField = new NumericField("modified");
                modifiedField.setLongValue(file.lastModified());
                doc.add(modifiedField);

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new Field("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:com.github.tteofili.looseen.MinHashClassifier.java
License:Apache License
public MinHashClassifier(IndexReader reader, String textField, String categoryField, int min,
        int hashCount, int hashSize) {
    this.min = min;
    this.hashCount = hashCount;
    this.hashSize = hashSize;
    try {
        Analyzer analyzer = createMinHashAnalyzer(min, hashCount, hashSize);
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, config);
        for (int i = 0; i < reader.maxDoc(); i++) {
            Document document = new Document();
            Document d = reader.document(i);
            String textValue = d.getField(textField).stringValue();
            String categoryValue = d.getField(categoryField).stringValue();
            document.add(new TextField(TEXT_FIELD, textValue, Field.Store.NO));
            document.add(new StringField(CLASS_FIELD, categoryValue, Field.Store.YES));
            writer.addDocument(document);
        }
        writer.commit();
        writer.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE);
}
From source file:com.github.tteofili.looseen.Test20NewsgroupsClassification.java
License:Apache License
void buildIndex(File indexDir, IndexWriter indexWriter) throws IOException {
    File[] groupsDir = indexDir.listFiles();
    if (groupsDir != null) {
        for (File group : groupsDir) {
            String groupName = group.getName();
            File[] posts = group.listFiles();
            if (posts != null) {
                for (File postFile : posts) {
                    String number = postFile.getName();
                    NewsPost post = parse(postFile, groupName, number);
                    Document d = new Document();
                    d.add(new StringField(CATEGORY_FIELD, post.getGroup(), Field.Store.YES));
                    d.add(new SortedDocValuesField(CATEGORY_FIELD, new BytesRef(post.getGroup())));
                    d.add(new TextField(SUBJECT_FIELD, post.getSubject(), Field.Store.YES));
                    d.add(new TextField(BODY_FIELD, post.getBody(), Field.Store.YES));
                    indexWriter.addDocument(d);
                }
            }
        }
    }
    indexWriter.commit();
}
From source file:com.github.tteofili.looseen.TestWikipediaClassification.java
License:Apache License
private static void importWikipedia(File dump, IndexWriter indexWriter) throws Exception {
    long start = System.currentTimeMillis();
    int count = 0;
    System.out.format("Importing %s...%n", dump);

    String title = null;
    String text = null;
    Set<String> cats = new HashSet<>();

    XMLInputFactory factory = XMLInputFactory.newInstance();
    StreamSource source;
    if (dump.getName().endsWith(".xml")) {
        source = new StreamSource(dump);
    } else {
        throw new RuntimeException("can index only wikipedia XML files");
    }
    XMLStreamReader reader = factory.createXMLStreamReader(source);
    while (reader.hasNext()) {
        if (count == Integer.MAX_VALUE) {
            break;
        }
        switch (reader.next()) {
        case XMLStreamConstants.START_ELEMENT:
            if ("title".equals(reader.getLocalName())) {
                title = reader.getElementText();
            } else if (TEXT_FIELD.equals(reader.getLocalName())) {
                text = reader.getElementText();
                Matcher matcher = pattern.matcher(text);
                int pos = 0;
                while (matcher.find(pos)) {
                    String group = matcher.group(1);
                    String catName = group.replaceAll("\\|\\s", "").replaceAll("\\|\\*", "");
                    Collections.addAll(cats, catName.split("\\|"));
                    pos = matcher.end();
                }
            }
            break;
        case XMLStreamConstants.END_ELEMENT:
            if ("page".equals(reader.getLocalName())) {
                Document page = new Document();
                if (title != null) {
                    page.add(new TextField(TITLE_FIELD, title, StoredField.Store.YES));
                }
                if (text != null) {
                    page.add(new TextField(TEXT_FIELD, text, StoredField.Store.YES));
                }
                for (String cat : cats) {
                    page.add(new StringField(CATEGORY_FIELD, cat, Field.Store.YES));
                    page.add(new SortedSetDocValuesField(CATEGORY_FIELD, new BytesRef(cat)));
                }
                indexWriter.addDocument(page);
                cats.clear();
                count++;
                if (count % 100000 == 0) {
                    indexWriter.commit();
                    System.out.format("Committed %d pages%n", count);
                }
            }
            break;
        }
    }
    indexWriter.commit();
    long millis = System.currentTimeMillis() - start;
    System.out.format("Imported %d pages in %d seconds (%.2fms/page)%n", count, millis / 1000,
            (double) millis / count);
}
From source file:com.github.wxiaoqi.search.lucene.LuceneDao.java
License:Open Source License
public void create(IndexObject indexObject) {
    IndexWriter indexWriter = null;
    try {
        IndexWriterConfig config = new IndexWriterConfig(this.getAnalyzer());
        indexWriter = new IndexWriter(this.getDirectory(), config);
        indexWriter.addDocument(DocumentUtil.IndexObject2Document(indexObject));
        indexWriter.commit();
    } catch (Exception e) {
        e.printStackTrace();
        try {
            if (indexWriter != null) { // writer construction itself may have failed
                indexWriter.rollback();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    } finally {
        try {
            if (indexWriter != null) {
                indexWriter.close();
            }
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }
}
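Since IndexWriter is AutoCloseable, the same create operation can be written with try-with-resources so the writer is closed on every path. A sketch under that assumption, reusing the class's own getAnalyzer(), getDirectory(), and DocumentUtil helpers; note the semantics shift slightly, since close() commits pending changes and the explicit rollback of the original is dropped:

public void create(IndexObject indexObject) {
    IndexWriterConfig config = new IndexWriterConfig(this.getAnalyzer());
    // try-with-resources closes the writer even if addDocument/commit throws;
    // an explicit rollback path is only needed when partially applied changes
    // must be discarded rather than committed by close().
    try (IndexWriter indexWriter = new IndexWriter(this.getDirectory(), config)) {
        indexWriter.addDocument(DocumentUtil.IndexObject2Document(indexObject));
        indexWriter.commit();
    } catch (Exception e) {
        e.printStackTrace();
    }
}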
From source file:com.globalsight.ling.lucene.Index.java
License:Apache License
public void addDocument(long p_mainId, long p_subId, String p_text) throws IOException {
    synchronized (m_state) {
        if (m_state != STATE_OPENED) {
            throw new IOException("index is not available");
        }
    }

    // clean the Lucene cache if present
    LuceneCache.cleanLuceneCache(m_directory);

    try {
        m_lock.writeLock().acquire();
        IndexWriter tempWriter = null;
        try {
            tempWriter = getIndexWriter(false);
            Document doc = getDocument(p_mainId, p_subId, p_text);
            tempWriter.addDocument(doc);
        } finally {
            m_lock.writeLock().release();
            IOUtils.closeWhileHandlingException(tempWriter);
        }
    } catch (InterruptedException ex) {
        throw new IOException(ex.getMessage());
    }
}
From source file:com.globalsight.ling.tm2.lucene.LuceneIndexWriter.java
License:Apache License
/**
 * Indexes segments. To maintain index integrity, indexes are at
 * first created in memory and merged into a file system index.
 *
 * @param p_tuvs List of BaseTmTuv, SegmentsForSave.AddTuv, or TM3Tuv
 * @param p_sourceLocale true if p_tuvs are source locale segments
 * @param p_indexTargetLocales true for TM3, false for TM2
 */
public void index(List p_tuvs, boolean p_sourceLocale, boolean p_indexTargetLocales) throws Exception {
    IndexWriterConfig conf = new IndexWriterConfig(LuceneUtil.VERSION, m_analyzer);
    conf.setOpenMode(m_isFirst ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
    IndexWriter fsIndexWriter = new IndexWriter(m_directory, conf);
    try {
        for (Iterator it = p_tuvs.iterator(); it.hasNext();) {
            Object tuv = it.next();
            Document doc = tuv instanceof BaseTmTuv
                    ? createDocumentFromBaseTmTuv((BaseTmTuv) tuv, p_sourceLocale, p_indexTargetLocales)
                    : tuv instanceof AddTuv
                            ? createDocumentFromAddTuv((AddTuv) tuv, p_sourceLocale, p_indexTargetLocales)
                            : tuv instanceof TM3Tuv
                                    ? createDocumentFromTM3Tuv((TM3Tuv<GSTuvData>) tuv, p_sourceLocale,
                                            p_indexTargetLocales)
                                    : null;
            fsIndexWriter.addDocument(doc);
        }
    } finally {
        fsIndexWriter.close();
    }

    // clean the Lucene cache if present
    LuceneCache.cleanLuceneCache(m_indexDir);
}
From source file:com.gmail.mosoft521.luceneDemo.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * <p/>
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on Windows, some temporary files raise this exception with an
                // "access denied" message; checking if the file can be read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}