Usage examples for org.apache.lucene.index.IndexWriter#getConfig()
public LiveIndexWriterConfig getConfig()
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexOffsetAnnotation(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { if (files[i].equals("NeuroScience.num.offset")) indexOffsetAnnotation(writer, new File(file, files[i])); }//from ww w . j a v a 2 s . c om } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { return; } try { // make a new, empty document Document doc = new Document(); BufferedReader br = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8)); String line = null; String filename = null; while ((line = br.readLine()) != null) { if (line.trim().length() == 0) { doc.add((new StringField("filename", filename, Field.Store.YES))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { writer.addDocument(doc); } else { System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } doc = new Document(); filename = null; continue; } String[] spl = line.split("\t"); doc.add(new DoubleField(spl[3], Double.parseDouble(spl[5]), Field.Store.YES)); if (filename == null) filename = spl[0]; } br.close(); } finally { fis.close(); } } } }
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.plain.PlainCorpusBuilder.java
License:Open Source License
/**
 * Adds {@code doc} to the index behind {@code writer} when the writer was opened
 * in CREATE mode. Any {@link IOException} from the writer is logged, not propagated.
 *
 * @param writer writer to the index where the document will be stored
 * @param doc    the pre-built document to add
 */
static void indexDocs(IndexWriter writer, Document doc) {
    if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
        try {
            writer.addDocument(doc);
        } catch (IOException ex1) {
            Logger.getLogger(PlainCorpusBuilder.class.getName()).log(Level.SEVERE, null, ex1);
        }
    } else {
        // FIX: non-CREATE modes previously fell into an empty else and the document
        // vanished silently; keep the no-op behavior but make the drop visible.
        Logger.getLogger(PlainCorpusBuilder.class.getName())
                .log(Level.WARNING, "writer not in CREATE mode; document was not indexed");
    }
}
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.wikipedia.WikiCorpusBuilder.java
License:Open Source License
/** * Indexes the given file using the given writer, or if a directory is given, recurses over * files and directories found under the given directory. * * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException/* w w w .j av a 2 s .c o m*/ */ static void indexDocs(IndexWriter writer, Document doc) { if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { try { writer.addDocument(doc); } catch (IOException ex1) { Logger.getLogger(WikiCorpusBuilder.class.getName()).log(Level.SEVERE, null, ex1); } } else { } }
From source file:edu.uci.ics.cs221wiki.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is given, * recurses over files and directories found under the given directory. * /*ww w. j a va 2 s. co m*/ * NOTE: This method indexes one document per input file. This is slow. For good * throughput, put multiple documents into your input file(s). An example of this is * in the benchmark module, which can create "line doc" files, one document per line, * using the * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. * * @param writer Writer to the index where the given file/dir info will be stored * @param file The file to index, or the directory to recurse into to find files to index * @throws IOException */ static void indexDocs(IndexWriter writer, File file) throws IOException { // do not try to index files that cannot be read if (file.canRead()) { if (file.isDirectory()) { String[] files = file.list(); // an IO error could occur if (files != null) { for (int i = 0; i < files.length; i++) { indexDocs(writer, new File(file, files[i])); } } } else { FileInputStream fis; try { fis = new FileInputStream(file); } catch (FileNotFoundException fnfe) { // at least on windows, some temporary files raise this exception with an "access denied" message // checking if the file can be read doesn't help return; } try { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); pathField.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a NumericField that is indexed (i.e. efficiently filterable with // NumericRangeFilter). 
This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. NumericField modifiedField = new NumericField("modified"); modifiedField.setLongValue(file.lastModified()); doc.add(modifiedField); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. HTMLTextParser htp = new HTMLTextParser(); htp.HTMLtoTextParser(file.getPath()); String parsedFileName = file.getPath().substring(file.getPath().lastIndexOf("/") + 1, file.getPath().indexOf(".") - 1); parsedFileName = "./output/" + parsedFileName + ".txt"; //System.out.println(file.getPath()); //System.out.println(parsedFileName); FileInputStream parsedFis = new FileInputStream(parsedFileName); doc.add(new Field("contents", new BufferedReader(new InputStreamReader(parsedFis, "UTF-8")))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.getPath()), doc); } } finally { fis.close(); } } } }
From source file:edu.uci.ics.searcher.IndexFiles.java
License:Apache License
/** * Add a url and its content to the index. * //w ww . java 2 s . c om * @param writer Writer to the index where the given file/dir info will be stored * @param url The url string * @param url_text_path Content file of the url */ static private void addDoc(IndexWriter writer, String url, String docsPath, String fileName) { Document doc = new Document(); try { // add url doc.add(new StringField("url", url, Field.Store.YES)); // add contents FileInputStream fis = new FileInputStream(docsPath + "Textdata/" + fileName); doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8")))); // add title String title = HtmlParser.getTitle(docsPath + "Htmldata/" + fileName); doc.add(new TextField("title", title, Field.Store.YES)); // add length File f = new File(docsPath + "Textdata/" + fileName); doc.add(new LongField("length", f.length(), Field.Store.YES)); // Document-level boost //doc.setBoost(1.0f); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + url); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + url); writer.updateDocument(new Term("url", url), doc); } } catch (Exception e) { System.err.println(e.getMessage()); } }
From source file:es.unizar.iaaa.crawler.butler.index.IndexFiles.java
License:Apache License
/** * Indexes the given file using the given writer, or if a directory is * given, recurses over files and directories found under the given * directory.//from ww w .ja v a 2 s. c om * * NOTE: This method indexes one document per input file. This is slow. For * good throughput, put multiple documents into your input file(s). An * example of this is in the benchmark module, which can create "line doc" * files, one document per line, using the <a href= * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html" * >WriteLineDocTask</a>. If there is a low-level I/O error */ public void indexDocs(IndexWriter writer, File file) throws IOException { // file ins salida.txt where the structure is (URI + parsetext)+ /* * Recno:: 0 URL:: http://www.unizar.es/ * * ParseText:: blablabla */ if (file.canRead()) { try (Scanner scan = new Scanner(new FileInputStream(file))) { // make a new, empty document // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't // tokenize // the field into separate words and don't index term // frequency // or positional information: // Add the last modified date of the file a field named // "modified". // Use a LongField that is indexed (i.e. efficiently // filterable with // NumericRangeFilter). This indexes to milli-second // resolution, which // is often too fine. You could instead create a number // based on // year/month/day/hour/minutes/seconds, down the resolution // you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. 
LOGGER.info("adding "); int i = 0; while (scan.hasNextLine()) { String line = scan.nextLine(); if (line.contains("Recno::")) { // fichero String url = scan.nextLine(); scan.nextLine(); scan.nextLine(); String content = scan.nextLine(); url = url.replace("URL:: ", ""); Document doc = new Document(); insertInIndex(url, "url", doc, "text"); insertInIndex(content, "content", doc, "text"); // ya se ha aacbado el fichero if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the // document (no old // document can be there): writer.addDocument(doc); } else { // Existing index (an old copy of this // document may have // been indexed) so // we use updateDocument instead to // replace the old one // matching the exact // path, if present: writer.updateDocument(new Term("path", file.getPath()), doc); } if (i % 100 == 0) LOGGER.info(i + " lines"); i++; } // siguiente linea } // Add the contents of the file to a field named "contents". // Specify a Reader, // so that the text of the file is tokenized and indexed, // but not stored. // Note that FileReader expects the file to be in UTF-8 // encoding. // If that's not the case searching for special characters // will fail LOGGER.info("added " + i); } } }
From source file:gov.ssa.test.lucenedemo.IndexFiles.java
/** * Indexes a single document/*from w w w. ja v a 2s . c om*/ */ static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { // make a new, empty document Document doc = new Document(); // Add the path of the file as a field named "path". Use a // field that is indexed (i.e. searchable), but don't tokenize // the field into separate words and don't index term frequency // or positional information: Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField); // Add the last modified date of the file a field named "modified". // Use a LongPoint that is indexed (i.e. efficiently filterable with // PointRangeQuery). This indexes to milli-second resolution, which // is often too fine. You could instead create a number based on // year/month/day/hour/minutes/seconds, down the resolution you require. // For example the long value 2011021714 would mean // February 17, 2011, 2-3 PM. doc.add(new LongPoint("modified", lastModified)); // Add the contents of the file to a field named "contents". Specify a Reader, // so that the text of the file is tokenized and indexed, but not stored. // Note that FileReader expects the file to be in UTF-8 encoding. // If that's not the case searching for special characters will fail. doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { // New index, so we just add the document (no old document can be there): System.out.println("adding " + file); writer.addDocument(doc); } else { // Existing index (an old copy of this document may have been indexed) so // we use updateDocument instead to replace the old one matching the exact // path, if present: System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }
From source file:index.Indexcategory.java
public static void main(String[] args) throws IOException { String indexPath = "/Users/smita/Documents/ES/index/abstract/"; String docsPath = null;/*from w ww. ja v a 2s . c o m*/ boolean create = true; String path = "/Users/smita/Documents/data/dbpedia/short_abstracts_en.nq"; Directory dir = FSDirectory.open(Paths.get(indexPath)); Analyzer analyzer = new StandardAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } else { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } IndexWriter writer = new IndexWriter(dir, iwc); FileInputStream inputStream = null; Scanner sc = null; try { int linecount = 0; inputStream = new FileInputStream(path); sc = new Scanner(inputStream, "UTF-8"); String ignore = sc.nextLine(); while (sc.hasNextLine()) { linecount++; String line = sc.nextLine(); //System.out.println(line); try { String article = line.split("> ")[0]; String category = line.split("> ")[2]; //System.out.println(article+" ++ "+category); //index row article = article.substring(29, article.length() - 1); //category=category.substring(38,category.length()-1); //System.out.println(article+" "+category); Document doc = new Document(); doc.add(new TextField("article", article, Field.Store.YES)); doc.add(new TextField("category", category, Field.Store.YES)); if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) { System.out.println("adding " + linecount); writer.addDocument(doc); } else { System.out.println("updating "); //writer.updateDocument(new Term("path", file.toString()), doc); } } catch (Exception e) { } } if (sc.ioException() != null) { throw sc.ioException(); } } finally { if (inputStream != null) { inputStream.close(); } if (sc != null) { sc.close(); } } writer.close(); }
From source file:index.IndexCoreMeta.java
/**
 * Reads a file of one-JSON-object-per-line bibliographic records and indexes each as
 * a Lucene document with "id", "title", and "abstract" fields, committing at the end.
 *
 * <p>Field extraction is best-effort: missing "bibo:shortTitle" / "bibo:abstract"
 * keys and unparseable lines are silently skipped (empty catches below are deliberate).
 *
 * @param writer   writer for the target index; committed once all lines are processed
 * @param filename path of the input file to read (UTF-8, one JSON object per line)
 * @throws FileNotFoundException if {@code filename} does not exist
 * @throws JSONException         declared but in practice swallowed by the per-line catch
 * @throws IOException           if the Scanner recorded an I/O error or commit fails
 */
private static void readFile(IndexWriter writer, String filename)
        throws FileNotFoundException, JSONException, IOException {
    FileInputStream inputStream = null;
    Scanner sc = null;
    try {
        int linecount = 0; // counts lines read; only referenced by commented-out prints
        inputStream = new FileInputStream(filename);
        sc = new Scanner(inputStream, "UTF-8");
        //String hash = sc.nextLine();
        while (sc.hasNextLine()) {
            // Per-record defaults; only overwritten when the JSON key is present.
            String id = "";
            String title = "NA";
            String date = "";
            String abs = "NA";
            String[] authors = null; // never populated; author extraction is commented out
            Document doc = new Document();
            linecount++;
            String line = sc.nextLine();
            try {
                JSONObject obj = new JSONObject(line);
                //System.out.println(obj.length());
                // id = obj.get("identifier").toString();
                // NOTE(review): the assignment above is commented out, so the "id"
                // field is always the empty string — TODO confirm this is intended.
                doc.add(new TextField("id", id, Field.Store.YES));
                //String type=obj.get("dc:type").toString();
                //document.addField("type", type);
                try {
                    title = obj.get("bibo:shortTitle").toString();
                    doc.add(new TextField("title", title, Field.Store.YES));
                    // date = obj.get("dc:date").toString();
                    // doc.add(new TextField("date", date, Field.Store.YES));
                } catch (Exception e2) {
                    // no short title on this record — "title" field simply omitted
                }
                try {
                    abs = obj.get("bibo:abstract").toString();
                    doc.add(new TextField("abstract", abs, Field.Store.YES));
                    //System.out.println(linecount + "," + abs);
                } catch (Exception e) {
                    // no abstract on this record — "abstract" field simply omitted
                }
                // JSONArray arr = obj.getJSONArray("bibo:AuthorList");
                // if (arr != null) {
                // for (int i = 0; i < arr.length(); i++) {
                // doc.add(new TextField("author", arr.get(i).toString(), Field.Store.YES));
                // //System.out.println(arr.get(i).toString());
                // }
                if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                    //System.out.println("adding " + linecount);
                    writer.addDocument(doc);
                } else {
                    // Non-CREATE modes intentionally index nothing here.
                    //System.out.println("updating ");
                    //writer.updateDocument(new Term("path", file.toString()), doc);
                }
            } catch (Exception e3) {
                // unparseable JSON line — skipped entirely
            }
        }
        // note that Scanner suppresses exceptions
        if (sc.ioException() != null) {
            throw sc.ioException();
        }
    } finally {
        if (inputStream != null) {
            inputStream.close();
        }
        if (sc != null) {
            sc.close();
        }
    }
    writer.commit();
}
From source file:index.IndexEx.java
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { try (InputStream stream = Files.newInputStream(file)) { Document doc = new Document(); Field pathField = new StringField("path", file.toString(), Field.Store.YES); doc.add(pathField);// ww w .j a v a2 s.c om doc.add(new LongField("modified", lastModified, Field.Store.NO)); doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { System.out.println("adding " + file); writer.addDocument(doc); } else { System.out.println("updating " + file); writer.updateDocument(new Term("path", file.toString()), doc); } } }