Usage examples for org.apache.lucene.index.IndexWriter#getConfig()
public LiveIndexWriterConfig getConfig()
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexOffsetAnnotation(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { if (files[i].equals("NeuroScience.num.offset")) indexOffsetAnnotation(writer, new File(file, files[i])); }//from ww w . j a v a 2 s . c om } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { return; } try { // make a new, empty document Document doc = new Document(); BufferedReader br = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)); String line = null; String filename = null; while ((line = br.readLine()) != null) { if (line.trim().length() == 0) { doc.add((new StringField("filename", filename, Field.Store.YES))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { writer.addDocument(doc); } else { System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } doc = new Document(); filename = null; continue; } String[] spl = line.split("\t"); doc.add(new DoubleField(spl[3], Double.parseDouble(spl[5]), Field.Store.YES)); if (filename == null) filename = spl[0]; } br.close(); } finally { fis.close(); } } } }
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.plain.PlainCorpusBuilder.java
License:Open Source License
/**
 * Adds {@code doc} to the index behind {@code writer} when the writer was opened
 * in CREATE mode. Any {@link IOException} from the writer is logged, not propagated.
 *
 * @param writer writer to the index where the document will be stored
 * @param doc    the pre-built document to add
 */
static void indexDocs(IndexWriter writer, Document doc) {
    if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
        try {
            writer.addDocument(doc);
        } catch (IOException ex1) {
            Logger.getLogger(PlainCorpusBuilder.class.getName()).log(Level.SEVERE, null, ex1);
        }
    } else {
        // FIX: non-CREATE modes previously fell into an empty else and the document
        // vanished silently; keep the no-op behavior but make the drop visible.
        Logger.getLogger(PlainCorpusBuilder.class.getName())
                .log(Level.WARNING, "writer not in CREATE mode; document was not indexed");
    }
}
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.wikipedia.WikiCorpusBuilder.java
License:Open Source License
/** * Indexes the given file using the given writer, or if a directory is given, recurses over * files and directories found under the given directory. * * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException/* w w w .j av a 2 s .c o m*/ */ static void indexDocs(IndexWriter writer, Document doc) { if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { try { writer.addDocument(doc); } catch (IOException ex1) { Logger.getLogger(WikiCorpusBuilder.class.getName()).log(Level.SEVERE, null, ex1); } } else { } }
From source file:edu.uci.ics.cs221wiki.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /*ww w. j a va 2 s. co m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); pathField.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a NumericField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). 
This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. NumericField modifiedField = new NumericField("modified"); modifiedField.setLongValue(file.lastModified()); doc.add(modifiedField); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. HTMLTextParser htp = new HTMLTextParser(); htp.HTMLtoTextParser(file.getPath()); String parsedFileName = file.getPath().substring(file.getPath().lastIndexOf("/") + 1, file.getPath().indexOf(".") - 1); parsedFileName = "./output/" + parsedFileName + ".txt"; //System.out.println(file.getPath()); //System.out.println(parsedFileName); FileInputStream parsedFis = new FileInputStream(parsedFileName); doc.add(new Field("contents", new BufferedReader(new InputStreamReader(parsedFis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:edu.uci.ics.searcher.IndexFiles.java
License:Apache License
/** * Add a url and its content to the index. * //w ww . java 2 s . c om * @param writer Writer to the index where the given file/dir info will be stored * @param url The url string * @param url_text_path Content file of the url */ static private void addDoc(IndexWriter writer, String url, String docsPath, String fileName) { Document doc = new Document(); try { // add url doc.add(new StringField("url", url, Field.Store.YES)); // add contents FileInputStream fis = new FileInputStream(docsPath + "Textdata/" + fileName); doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); // add title String title = HtmlParser.getTitle(docsPath + "Htmldata/" + fileName); doc.add(new TextField("title", title, Field.Store.YES)); // add length File f = new File(docsPath + "Textdata/" + fileName); doc.add(new LongField("length", f.length(), Field.Store.YES)); // Document-level boost //doc.setBoost(1.0f); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + url); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + url); writer.updateDocument(new Term("url", url), doc); } } catch (Exception e) { System.err.println(e.getMessage()); } }
From source file:es.unizar.iaaa.crawler.butler.index.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is * given, recurses over files and directories found under the given * directory.//from ww w .ja v a 2 s. c om * * NOTE: This method indexes one document per input file. This is slow. For * good throughput, put multiple documents into your input file(s). An * example of this is in the benchmark module, which can create "line doc" * files, one document per line, using the <a href= * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. If there is a low-level I/O error */ public void indexDocs(IndexWriter writer, File file) throws IOException { // file ins salida.txt where the structure is (URI + parsetext)+ /* * Recno:: 0 URL:: http://www.unizar.es/ * * ParseText:: blablabla */ if (file.canRead()) { try (Scanner scan = new Scanner(new FileInputStream(file))) { // make a new, empty document // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't // tokenize // the field into separate words and don't index term // frequency // or positional information: // Add the last modified date of the file a field named // "modified". // Use a LongField that is indexed (i.e. efficiently // filterable with // NumericRangeFilter). This indexes to milli-second // resolution, which // is often too fine. You could instead create a number // based on // year/month/day/hour/minutes/seconds, down the resolution // you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. 
LOGGER.info("adding "); int i = 0; while (scan.hasNextLine()) { String line = scan.nextLine(); if (line.contains("Recno::")) { // fichero String url = scan.nextLine(); scan.nextLine(); scan.nextLine(); String content = scan.nextLine(); url = url.replace("URL:: ", ""); Document doc = new Document(); insertInIndex(url, "url", doc, "text"); insertInIndex(content, "content", doc, "text"); // ya se ha aacbado el fichero if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the // document (no old // document can be there): writer.addDocument(doc); } else { // Existing index (an old copy of this // document may have // been indexed) so // we use updateDocument instead to // replace the old one // matching the exact // path, if present: writer.updateDocument(new Term("path", file.getPath()), doc); } if (i % 100 == 0) LOGGER.info(i + " lines"); i++; } // siguiente linea } // Add the contents of the file to a field named "contents". // Specify a Reader, // so that the text of the file is tokenized and indexed, // but not stored. // Note that FileReader expects the file to be in UTF-8 // encoding. // If that's not the case searching for special characters // will fail LOGGER.info("added " + i); } } }
From source file:gov.ssa.test.lucenedemo.IndexFiles.java
/** * Indexes a single document/*from w w w. ja v a 2s . c om*/ */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:index.Indexcategory.java
public static void main(String[] args) throws IOException { String indexPath = "/Users/smita/Documents/ES/index/abstract/"; String docsPath = null;/*from w ww. ja v a 2s . c o m*/ boolean create = true; String path = "/Users/smita/Documents/data/dbpedia/short_abstracts_en.nq"; Directory dir = FSDirectory.open(Paths.get(indexPath)); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } else { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } IndexWriter writer = new IndexWriter(dir, iwc); FileInputStream inputStream = null; Scanner sc = null; try { int linecount = 0; inputStream = new FileInputStream(path); sc = new Scanner(inputStream, "UTF-8"); String ignore = sc.nextLine(); while (sc.hasNextLine()) { linecount++; String line = sc.nextLine(); //System.out.println(line); try { String article = line.split("> ")[0]; String category = line.split("> ")[2]; //System.out.println(article+" ++ "+category); //index row article = article.substring(29, article.length() - 1); //category=category.substring(38,category.length()-1); //System.out.println(article+" "+category); Document doc = new Document(); doc.add(new TextField("article", article, Field.Store.YES)); doc.add(new TextField("category", category, Field.Store.YES)); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { System.out.println("adding " + linecount); writer.addDocument(doc); } else { System.out.println("updating "); //writer.updateDocument(new Term("path", file.toString()), doc); } } catch (Exception e) { } } if (sc.ioException() != null) { throw sc.ioException(); } } finally { if (inputStream != null) { inputStream.close(); } if (sc != null) { sc.close(); } } writer.close(); }
From source file:index.IndexCoreMeta.java
/**
 * Reads a file of one-JSON-object-per-line bibliographic records and indexes each as
 * a Lucene document with "id", "title", and "abstract" fields, committing at the end.
 *
 * <p>Field extraction is best-effort: missing "bibo:shortTitle" / "bibo:abstract"
 * keys and unparseable lines are silently skipped (empty catches below are deliberate).
 *
 * @param writer   writer for the target index; committed once all lines are processed
 * @param filename path of the input file to read (UTF-8, one JSON object per line)
 * @throws FileNotFoundException if {@code filename} does not exist
 * @throws JSONException         declared but in practice swallowed by the per-line catch
 * @throws IOException           if the Scanner recorded an I/O error or commit fails
 */
private static void readFile(IndexWriter writer, String filename)
        throws FileNotFoundException, JSONException, IOException {
    FileInputStream inputStream = null;
    Scanner sc = null;
    try {
        int linecount = 0; // counts lines read; only referenced by commented-out prints
        inputStream = new FileInputStream(filename);
        sc = new Scanner(inputStream, "UTF-8");
        //String hash = sc.nextLine();
        while (sc.hasNextLine()) {
            // Per-record defaults; only overwritten when the JSON key is present.
            String id = "";
            String title = "NA";
            String date = "";
            String abs = "NA";
            String[] authors = null; // never populated; author extraction is commented out
            Document doc = new Document();
            linecount++;
            String line = sc.nextLine();
            try {
                JSONObject obj = new JSONObject(line);
                //System.out.println(obj.length());
                // id = obj.get("identifier").toString();
                // NOTE(review): the assignment above is commented out, so the "id"
                // field is always the empty string — TODO confirm this is intended.
                doc.add(new TextField("id", id, Field.Store.YES));
                //String type=obj.get("dc:type").toString();
                //document.addField("type", type);
                try {
                    title = obj.get("bibo:shortTitle").toString();
                    doc.add(new TextField("title", title, Field.Store.YES));
                    // date = obj.get("dc:date").toString();
                    // doc.add(new TextField("date", date, Field.Store.YES));
                } catch (Exception e2) {
                    // no short title on this record — "title" field simply omitted
                }
                try {
                    abs = obj.get("bibo:abstract").toString();
                    doc.add(new TextField("abstract", abs, Field.Store.YES));
                    //System.out.println(linecount + "," + abs);
                } catch (Exception e) {
                    // no abstract on this record — "abstract" field simply omitted
                }
                // JSONArray arr = obj.getJSONArray("bibo:AuthorList");
                // if (arr != null) {
                // for (int i = 0; i < arr.length(); i++) {
                // doc.add(new TextField("author", arr.get(i).toString(), Field.Store.YES));
                // //System.out.println(arr.get(i).toString());
                // }
                if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                    //System.out.println("adding " + linecount);
                    writer.addDocument(doc);
                } else {
                    // Non-CREATE modes intentionally index nothing here.
                    //System.out.println("updating ");
                    //writer.updateDocument(new Term("path", file.toString()), doc);
                }
            } catch (Exception e3) {
                // unparseable JSON line — skipped entirely
            }
        }
        // note that Scanner suppresses exceptions
        if (sc.ioException() != null) {
            throw sc.ioException();
        }
    } finally {
        if (inputStream != null) {
            inputStream.close();
        }
        if (sc != null) {
            sc.close();
        }
    }
    writer.commit();
}
From source file:index.IndexEx.java
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { Document doc = new Document(); Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);// ww w .j a v a2 s.c om doc.add(new LongField("modified", lastModified, Field.Store.NO)); doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding " + file); writer.addDocument(doc); } else { System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }