List of usage examples for the org.apache.lucene.index.IndexWriterConfig constructor
public IndexWriterConfig(Analyzer analyzer)
From source file:MyServlet.java
public void init() throws ServletException { try {//from ww w .jav a 2s. co m // Specify the analyzer for tokenizing text. // The same analyzer should be used for indexing and searching analyzer = new StandardAnalyzer(); // Code to create the index index = new RAMDirectory(); config = new IndexWriterConfig(analyzer); w = new IndexWriter(index, config); addDoc(w, " Software Engineering 2", "CMPE 133", "Mon.", "Computer Engineering"); addDoc(w, " Software Engineering 1", "CMPE 131", "Mon.", "Computer Engineering"); addDoc(w, " Object Oriented Design", "CS 151", "Mon.", "Computer Science"); addDoc(w, " Advance Data Structures with Java ", "CS 146:", "Mon.", "Computer Science"); addDoc(w, " System Security with Java", "CS 166:", "Mon.", "Computer Science"); addDoc(w, "Liner math", "ME 123", "Mon.", "Math"); w.close(); log = new searchHistory(); for (int i = 1; i <= 10; i++) { Student std = new Student(); std.setUserName("std" + i); std.setPassword("123"); stds.add(std); Teacher tch = new Teacher(); tch.setUserName("tch" + i); tch.setPassword("123"); tchs.add(tch); } } catch (Exception e) { System.out.println(e.getMessage()); } System.out.println("init"); }
From source file:MyServlet.java
private void gotoAddCourse(PrintWriter out, HttpServletRequest request, HttpServletResponse response) { String name = request.getParameter("name"); String id = request.getParameter("id"); String department = request.getParameter("department"); String time = request.getParameter("time"); try {//from w ww .j ava 2s .c o m IndexWriterConfig newConfig = new IndexWriterConfig(analyzer); w = new IndexWriter(index, newConfig); addDoc(w, name, id, time, department); System.out.println(name); w.close(); } catch (Exception e) { System.out.println(e.getMessage()); } String msg = "Create course success"; String title = "Create course success"; gotoMsg(out, request, response, title, msg); }
From source file:FileIndexer.java
License:Apache License
public static void main(String[] args) { String usage = "java FileIndexer" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-excludes FILE] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles\n" + "excludes is an optional list of files to be excluded, one per line."; String indexPath = "index"; String docsPath = null;/*from w w w . java 2s .c o m*/ boolean create = true; List<String> excludes = new ArrayList<String>(); for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-excludes".equals(args[i])) { Scanner sc = null; try { sc = new Scanner(new File(args[i + 1])); i++; } catch (FileNotFoundException fnfe) { System.err.println(fnfe.getMessage()); System.exit(1); } while (sc.hasNext()) { excludes.add(sc.next()); } sc.close(); } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final Path docDir = Paths.get(docsPath); if (!Files.isReadable(docDir)) { System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(Paths.get(indexPath)); Analyzer analyzer = new LimitTokenCountAnalyzer(new StandardAnalyzer(), 1000000); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir, excludes); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:IrqaQuery.java
License:Apache License
/**
 * Opens the index at {@code indexPath} and stores a new {@code IndexWriter} in the
 * {@code writer} field of this class.
 *
 * <p>The writer uses an {@code EnglishAnalyzer} whose stop set is built from
 * {@code mygetStopwords(stopPath)}.
 *
 * @param indexPath file-system location of the index
 * @param stopPath  location handed to {@code mygetStopwords}
 * @param sim       {@code "TFIDF"} selects {@code ClassicSimilarity}; any other value,
 *                  including {@code null}, selects {@code BM25Similarity}
 * @throws IOException if the directory or the writer cannot be opened
 */
public static void makeIndexWriter(String indexPath, String stopPath, String sim) throws IOException {
    System.out.println("[makeIndexWriter] started");
    System.out.println("[makeIndexWriter]" + stopPath);

    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stopPath)));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    // Constant-first equals makes a null 'sim' fall through to the BM25 default instead of
    // throwing. The former explicit "BM25" branch was identical to the default branch.
    if ("TFIDF".equals(sim)) {
        iwc.setSimilarity(new ClassicSimilarity());
    } else {
        iwc.setSimilarity(new BM25Similarity());
    }

    writer = new IndexWriter(dir, iwc);
}
From source file:DocIndexer.java
License:Apache License
private RAMDirectory index() throws IOException, UnsupportedEncodingException, FileNotFoundException { RAMDirectory directory = new RAMDirectory(); IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer(CharArraySet.EMPTY_SET)); config.setOpenMode(OpenMode.CREATE); config.setCommitOnClose(true);//from w w w .ja v a 2 s . c o m try (IndexWriter iwriter = new IndexWriter(directory, config)) { for (String inputFile : inputFiles) { File file = new File(inputFile); if (file.length() == 0) { continue; } String title; try (BufferedReader titleReader = new BufferedReader( new InputStreamReader(new FileInputStream(file), "UTF-8"))) { title = titleReader.readLine(); if (title != null && title.startsWith("[[")) { // Generally the first line of the txt is the title. In a few cases the // first line is a "[[tag]]" and the second line is the title. title = titleReader.readLine(); } } Matcher matcher = SECTION_HEADER.matcher(title); if (matcher.matches()) { title = matcher.group(1); } String outputFile = AsciiDoctor.mapInFileToOutFile(inputFile, inExt, outExt); try (FileReader reader = new FileReader(file)) { Document doc = new Document(); doc.add(new TextField(Constants.DOC_FIELD, reader)); doc.add(new StringField(Constants.URL_FIELD, prefix + outputFile, Field.Store.YES)); doc.add(new TextField(Constants.TITLE_FIELD, title, Field.Store.YES)); iwriter.addDocument(doc); } } } return directory; }
From source file:DVBench.java
License:Apache License
static void doBench(int bpv) throws Exception { File file = new File("/data/indices/dvbench"); file.mkdirs();/*from www.j ava 2 s .co m*/ Directory dir = FSDirectory.open(file); IndexWriterConfig config = new IndexWriterConfig(null); config.setOpenMode(OpenMode.CREATE); config.setMergeScheduler(new SerialMergeScheduler()); config.setMergePolicy(new LogDocMergePolicy()); config.setMaxBufferedDocs(25000); IndexWriter writer = new IndexWriter(dir, config); MyRandom r = new MyRandom(); int numdocs = 400000; Document doc = new Document(); Field dv = new NumericDocValuesField("dv", 0); Field inv = new LongField("inv", 0, Field.Store.NO); Field boxed = new BinaryDocValuesField("boxed", new BytesRef(8)); Field boxed2 = new BinaryDocValuesField("boxed2", new BytesRef(8)); doc.add(dv); doc.add(inv); doc.add(boxed); doc.add(boxed2); for (int i = 0; i < numdocs; i++) { // defeat blockpackedwriter final long value; if (i % 8192 == 0) { value = bpv == 64 ? Long.MIN_VALUE : 0; } else if (i % 8192 == 1) { value = bpv == 64 ? 
Long.MAX_VALUE : (1L << bpv) - 1; } else { value = r.nextLong(bpv); } dv.setLongValue(value); inv.setLongValue(value); box(value, boxed.binaryValue()); box(value, boxed2.binaryValue()); boxed2.binaryValue().length = (bpv + 7) / 8; // fixed length writer.addDocument(doc); } writer.close(); // run dv search tests String description = "dv (bpv=" + bpv + ")"; DirectoryReader reader = DirectoryReader.open(dir); IndexSearcher searcher = new IndexSearcher(reader); searcher.setQueryCache(null); // don't bench the cache int hash = 0; // warmup hash += search(description, searcher, "dv", 300, true); hash += search(description, searcher, "dv", 300, false); // Uninverting Map<String, UninvertingReader.Type> mapping = Collections.singletonMap("inv", UninvertingReader.Type.LONG); DirectoryReader uninv = UninvertingReader.wrap(reader, mapping); IndexSearcher searcher2 = new IndexSearcher(uninv); searcher2.setQueryCache(null); // don't bench the cache description = "fc (bpv=" + bpv + ")"; // warmup hash += search(description, searcher2, "inv", 300, true); hash += search(description, searcher2, "inv", 300, false); // Boxed inside binary DirectoryReader boxedReader = new BinaryAsVLongReader(reader); IndexSearcher searcher3 = new IndexSearcher(boxedReader); searcher3.setQueryCache(null); // don't bench the cache description = "boxed (bpv=" + bpv + ")"; // warmup hash += search(description, searcher3, "boxed", 300, true); hash += search(description, searcher3, "boxed", 300, false); description = "boxed fixed-length (bpv=" + bpv + ")"; // warmup hash += search(description, searcher3, "boxed2", 300, true); hash += search(description, searcher3, "boxed2", 300, false); if (hash == 3) { // wont happen System.out.println("hash=" + hash); } reader.close(); dir.close(); }
From source file:IndexAndSearchOpenStreetMaps1D.java
License:Apache License
private static void createIndex() throws IOException { long t0 = System.nanoTime(); CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); int BUFFER_SIZE = 1 << 16; // 64K InputStream is = Files .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt")); BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE); Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : ""))); IndexWriterConfig iwc = new IndexWriterConfig(null); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); //iwc.setMaxBufferedDocs(109630); //iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); iwc.setRAMBufferSizeMB(256.0);//from ww w . ja v a2s . c o m iwc.setMergePolicy(new LogDocMergePolicy()); iwc.setMergeScheduler(new SerialMergeScheduler()); iwc.setInfoStream(new PrintStreamInfoStream(System.out)); IndexWriter w = new IndexWriter(dir, iwc); int count = 0; byte[] scratch = new byte[4]; while (true) { String line = reader.readLine(); if (line == null) { break; } String[] parts = line.split(","); //long id = Long.parseLong(parts[0]); int lat = (int) (1000000. * Double.parseDouble(parts[1])); //int lon = (int) (1000000. * Double.parseDouble(parts[2])); Document doc = new Document(); if (USE_NF) { doc.add(new LegacyIntField("latnum", lat, Field.Store.NO)); //doc.add(new LongField("lonnum", lon, Field.Store.NO)); } else { doc.add(new IntPoint("lat", lat)); //doc.add(new SortedNumericDocValuesField("lon", lon)); } w.addDocument(doc); count++; if (count % 1000000 == 0) { System.out.println(count + "..."); } } //w.forceMerge(1); w.commit(); System.out.println(w.maxDoc() + " total docs"); w.close(); long t1 = System.nanoTime(); System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index"); }
From source file:alix.lucene.Alix.java
License:Open Source License
/** * Start to scan the glob of xml files// w w w . ja v a2 s.c om * * @param indexDir where the lucene indexes are generated * @param anAnalyzer Analyzer to use for analyzed fields * @param similarity instance of Similarity to work with the writer * @throws TransformerConfigurationException */ static public void walk(String xmlGlob, String xslFile, String indexDir) throws IOException, TransformerConfigurationException { info("Lucene, src:" + xmlGlob + " parser:" + xslFile + " index:" + indexDir); Path srcDir = Paths.get(xmlGlob); PathMatcher glob = FileSystems.getDefault().getPathMatcher("glob:*.xml"); if (!Files.isDirectory(srcDir)) { String pattern = srcDir.getFileName().toString(); glob = FileSystems.getDefault().getPathMatcher("glob:" + pattern); srcDir = srcDir.getParent(); } if (!Files.isDirectory(srcDir)) { fatal("FATAL " + srcDir + " NOT FOUND"); } Path indexPath = Paths.get(indexDir); Files.createDirectories(indexPath); Directory dir = FSDirectory.open(indexPath); // TODO configure analyzers Analyzer analyzer = new XmlAnalyzer(); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setOpenMode(OpenMode.CREATE_OR_APPEND); conf.setSimilarity(new BM25Similarity()); conf.setCodec(new ChapitreCodec()); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // conf.setRAMBufferSizeMB(256.0); lucwriter = new IndexWriter(dir, conf); System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl"); TransformerFactory tf = TransformerFactory.newInstance(); tf.setAttribute("http://saxon.sf.net/feature/version-warning", Boolean.FALSE); tf.setAttribute("http://saxon.sf.net/feature/recoveryPolicy", new Integer(0)); parser = tf.newTransformer(new StreamSource(xslFile)); final PathMatcher matcher = glob; // transmit the matcher by a final variable to the anonymous class Files.walkFileTree(srcDir, new SimpleFileVisitor<Path>() { @Override public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) { if (path.getFileName().toString().startsWith(".")) return FileVisitResult.CONTINUE; if (!matcher.matches(path.getFileName())) return FileVisitResult.CONTINUE; parse(path); return FileVisitResult.CONTINUE; } public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs) { // .git, .svn if (path.getFileName().toString().startsWith(".")) return FileVisitResult.SKIP_SUBTREE; return FileVisitResult.CONTINUE; } }); lucwriter.commit(); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // lucwriter.forceMerge(1); lucwriter.close(); }
From source file:antnlp.opie.indexsearch.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = "index"; String docsPath = null;//from w w w . java 2 s . c o m boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final Path docDir = Paths.get(docsPath); if (!Files.isReadable(docDir)) { System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(Paths.get(indexPath)); //Analyzer analyzer = new StandardAnalyzer(); //Analyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET); Analyzer analyzer = new WhitespaceAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:api.startup.PDFIndexer.java
License:Open Source License
/** * Updates the index//from w w w. j a v a 2s .c o m * * @throws IOException */ public void updateIndex() throws IOException { try { long startTime = System.nanoTime(); // Get the index directory Directory dir = FSDirectory.open(Paths.get(indexDirectory)); // Get the directory for resources String resourcesDir = resourceDirectory + "/" + Constants.CSV_LOCATION; // Get PDF Analyzer Analyzer pdf_analyzer = new PDFAnalyzer(resourceDirectory + "/" + Constants.STOPWORDS_FILE); // Create an index writer config with the analyzer IndexWriterConfig iwc = new IndexWriterConfig(pdf_analyzer); // Set the open mode to create or append iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // Set index to be created // Create an index writer IndexWriter writer = new IndexWriter(dir, iwc); // Index the documents indexDocs(writer, resourcesDir); long endTime = System.nanoTime(); LuceneIndexReader.getInstance().initializeIndexReader(writer); writer.close(); log.info("Took: " + (endTime - startTime) / Math.pow(10, 6) + " milliseconds to generate the index."); } catch (IOException e) { log.error("IO Exception Thrown while updating index " + e.getMessage() + "\n"); } }