Example usage for org.apache.lucene.index IndexWriter getConfig

List of usage examples for org.apache.lucene.index IndexWriter getConfig

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriter getConfig.

Prototype

public LiveIndexWriterConfig getConfig() 

Source Link

Document

Returns a LiveIndexWriterConfig, which can be used to query the IndexWriter's current settings, as well as to modify "live" ones.

Usage

From source file:com.yangxu.searchengine.index.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory./* w w w  .j  a  v  a  2 s .c  o  m*/
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
private void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    // title
    String titleValue = null;
    // content
    String contentValue = null;
    String urlValue = null;
    String indextimeValue = null;
    String uploadtimeValue = null;
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
                LineNumberReader reader = new LineNumberReader(new InputStreamReader(fis, "UTF-8"));
                String line = null;
                StringBuilder sb = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    // int lineNumber = reader.getLineNumber();
                    switch (reader.getLineNumber()) {
                    case 1:
                        urlValue = line;
                        break;
                    case 2:
                        uploadtimeValue = line;
                        break;
                    case 3:
                        titleValue = line.split(":")[1];
                        break;
                    case 4:
                        break;
                    default:
                        sb.append(line);
                        break;

                    }
                    /*
                     * if (reader.getLineNumber() == 1) { urlValue = line; }
                     * 
                     * if (reader.getLineNumber() == 3) { titleValue =
                     * line.split(":")[1]; } else if (reader.getLineNumber()
                     * > 4) { sb.append(line); }
                     */
                }
                contentValue = sb.toString();
                reader.close();

            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't
                // tokenize
                // the field into separate words and don't index term
                // frequency
                // or positional information:

                Field urlField = new Field("url", urlValue, Field.Store.YES, Field.Index.NOT_ANALYZED);
                urlField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(urlField);

                Field titleField = new Field("title", titleValue, Field.Store.YES, Field.Index.ANALYZED);
                titleField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(titleField);

                Field contentField = new Field("content", contentValue, Field.Store.YES, Field.Index.ANALYZED);
                contentField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(contentField);
                // Add the last modified date of the file a field named
                // "modified".
                // Use a NumericField that is indexed (i.e. efficiently
                // filterable with
                // NumericRangeFilter). This indexes to milli-second
                // resolution, which
                // is often too fine. You could instead create a number
                // based on
                // year/month/day/hour/minutes/seconds, down the resolution
                // you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.               
                SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMdd HH:mm:ss");
                //Calendar cal = Calendar.getInstance(); // 
                //timeValue = formatter.format(cal.getTime());
                Date now = new Date();
                indextimeValue = formatter.format(now);

                Field indextimeField = new Field("indextime", indextimeValue, Field.Store.YES,
                        Field.Index.NOT_ANALYZED);
                titleField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(indextimeField);

                Field uploadtimeField = new Field("uploadtime", uploadtimeValue, Field.Store.YES,
                        Field.Index.NOT_ANALYZED);
                titleField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(uploadtimeField);

                // Add the contents of the file to a field named "contents".
                // Specify a Reader,
                // so that the text of the file is tokenized and indexed,
                // but not stored.
                // Note that FileReader expects the file to be in UTF-8
                // encoding.
                // If that's not the case searching for special characters
                // will fail.
                // doc.add(new Field("contents", new BufferedReader(
                // new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("url", urlValue), doc);
                    writer.updateDocument(new Term("title", titleValue), doc);
                    writer.updateDocument(new Term("content", contentValue), doc);
                    writer.updateDocument(new Term("indextime", String.valueOf(indextimeValue)), doc);
                    writer.updateDocument(new Term("uploadtime", String.valueOf(uploadtimeValue)), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.zghw.lucene.demo.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * /* w w  w.ja v a  2s  .  c  o  m*/
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.zsq.lucene.chapter1.IndexFiles.java

License:Apache License

/** Indexes a single document. */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream in = Files.newInputStream(file)) {
        // fresh, empty document for this file
        Document doc = new Document();

        // "path": indexed as one untokenized term (searchable, no term
        // frequency/positions) and stored; used as the update key below
        doc.add(new StringField("path", file.toString(), Field.Store.YES));

        // "modified": numeric field usable with NumericRangeFilter;
        // millisecond resolution is often finer than needed — a coarser
        // value such as 2011021714 (Feb 17, 2011, 2-3 PM) also works
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // "contents": tokenized and indexed from a Reader, not stored;
        // the reader assumes UTF-8 input, otherwise searches for special
        // characters will fail
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))));

        boolean creating = writer.getConfig().getOpenMode() == OpenMode.CREATE;
        if (creating) {
            // brand-new index: nothing to replace, a plain add is enough
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // existing index: replace any older document with this exact path
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:concurrency.IndexFiles.java

License:Apache License

/** Indexes a single document. */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        Document document = new Document();
        String pathText = file.toString();

        // "path" is searchable as a single untokenized term (no term
        // frequency or positional data) and stored, so it can serve as the
        // unique key for updates below
        document.add(new StringField("path", pathText, Field.Store.YES));

        // "modified" is a numeric field, efficiently filterable with
        // NumericRangeFilter; millisecond resolution — coarser encodings
        // such as 2011021714 (February 17, 2011, 2-3 PM) are also possible
        document.add(new LongField("modified", lastModified, Field.Store.NO));

        // "contents" is fed from a Reader: tokenized and indexed but not
        // stored; the file must be UTF-8 encoded or searches for special
        // characters will fail
        Reader contents = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        document.add(new TextField("contents", contents));

        if (writer.getConfig().getOpenMode() != OpenMode.CREATE) {
            // existing index: swap out any older document with this path
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", pathText), document);
        } else {
            // brand-new index: nothing to replace, plain add is enough
            System.out.println("adding " + file);
            writer.addDocument(document);
        }
    }
}

From source file:cs412.project.search.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur (list() returns null in that case)
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // "path": indexed (searchable) but not tokenized and no term
                // frequency/positions; stored so it can serve as the unique
                // update key below
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // "modified": numeric LongField, efficiently filterable with
                // NumericRangeFilter; millisecond resolution (often finer than
                // needed — e.g. 2011021714 would mean Feb 17, 2011, 2-3 PM)
                doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));

                // "contents": tokenized and indexed from a Reader, not stored;
                // the file must be UTF-8 encoded or searches for special
                // characters will fail
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    //            System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index: replace the old copy keyed by its exact path.
                    // BUGFIX: a second call
                    //   writer.updateDocument(new Term("modified", Long.toString(...)), doc)
                    // was removed — a LongField is indexed numerically, so a plain
                    // string Term can never match it; the call therefore deleted
                    // nothing and re-added the document, duplicating every updated
                    // file in the index.
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:cs571.proj1.IndexFiles.java

License:Apache License

/**
 * Indexes a single input file that may contain multiple logical documents,
 * delimited by the {@code docStart}/{@code docEnd} marker lines (presumably
 * TREC-style &lt;DOC&gt; ... &lt;/DOC&gt; blocks — confirm against the
 * constants' definitions). Each document's ID is taken from the first line
 * starting with {@code docNO_start} (preferred) or {@code docID_start};
 * blocks with neither are silently skipped. Content lines between the
 * markers are accumulated and indexed under "contents"; when {@code tfidf}
 * or {@code bm25} is set, term vectors with positions and offsets are
 * stored as well. Increments the class-level {@code numOfDocuments} counter
 * per indexed document.
 */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    if (checkForIgnoredFile(file.getFileName().toString()))
        return; // file matches the ignore list — skip it entirely
    try (InputStream stream = Files.newInputStream(file)) {
        // Read line by line; the file is expected to be UTF-8 encoded.
        BufferedReader br = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        String line, docNO = null, docID = null, s;
        StringBuilder sb = new StringBuilder();   // content of the current document block
        Field docIDField = null;
        Document doc = new Document();

        boolean docFound = false;  // true while between docStart and docEnd markers
        while ((line = br.readLine()) != null) {
            line = line.trim();
            // Capture the document number / id lines; note these are matched
            // anywhere, not only inside a docStart..docEnd block.
            if (line.startsWith(docNO_start)) {
                docNO = removeTags(line);
                continue;
            }
            if (line.startsWith(docID_start)) {
                docID = removeTags(line);
                continue;
            }

            if (line.equals(docStart)) {
                docFound = true;
                continue;
            }

            if (line.equals(docEnd)) {
                docFound = false;

                // Prefer DOCNO over DOCID; a block with neither is dropped.
                // NOTE(review): docNO/docID are never reset between blocks,
                // so a later block without its own tag inherits the previous
                // one's id — confirm this is intended.
                if (docNO != null) {
                    docIDField = new StringField("docID", docNO, Field.Store.YES);
                    doc.add(docIDField);
                } else if (docID != null) {
                    docIDField = new StringField("docID", docID, Field.Store.YES);
                    doc.add(docIDField);
                } else {
                    continue;
                }

                if (tfidf || bm25) {
                    // Scoring models that need term statistics: store term
                    // vectors with positions and offsets for "contents".
                    FieldType tv = new FieldType();
                    tv.setTokenized(true);
                    tv.setStoreTermVectors(true);
                    tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                    s = sb.toString();
                    // Round-trip through a stream so the field is fed from a
                    // Reader (tokenized/indexed, not stored).
                    // NOTE(review): getBytes()/InputStreamReader here use the
                    // platform default charset — confirm that is intended.
                    byte[] bytes = s.getBytes();
                    ByteArrayInputStream bstream = new ByteArrayInputStream(bytes);
                    InputStreamReader isr = new InputStreamReader(bstream);
                    doc.add(new Field("contents", isr, tv));
                } else {
                    // Plain tokenized contents, not stored.
                    doc.add(new TextField("contents", sb.toString(), Field.Store.NO));
                }

                // "path" keys updates; "modified" supports range filtering.
                Field pathField = new StringField("path", file.toString(), Field.Store.YES);
                doc.add(pathField);
                doc.add(new LongField("modified", lastModified, Field.Store.NO));
                numOfDocuments++;
                sb.setLength(0);  // reset the buffer for the next block
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.toString()), doc);
                }
                doc = new Document();  // start a fresh document for the next block
                continue;
            }

            // Ordinary content line inside a document block.
            if (docFound) {
                sb.append(line).append("\n");
            }

        }

    }
}

From source file:de.hsmannheim.ss15.alr.searchengine.DefaultLuceneController.java

/**
 * Indexes a single document.
 *
 * Input layout: up to three optional header lines at the top of the file —
 * "URL:...", "DataType:...", "Title:..." — each indexed as its own stored
 * field when present; all remaining lines are concatenated and indexed
 * (not stored) under "contents".
 *
 * @param writer       index writer receiving the document
 * @param file         path of the file to index (also used as the update key)
 * @param lastModified last-modified timestamp stored in the "modified" field
 * @throws IOException if the file cannot be read
 */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

        // "path": indexed as a single untokenized term and stored; serves
        // as the unique key for updates below
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // "modified": numeric field, efficiently filterable with
        // NumericRangeFilter; millisecond resolution
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // The file is expected to be UTF-8 encoded.
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));

        // BUGFIX: the previous loop was `while (reader.ready()) ...`.
        // Reader.ready() only reports whether a read can complete without
        // blocking — it may return false before end-of-file, silently
        // truncating the input. Read until readLine() returns null instead.
        List<String> lines = new ArrayList<>();
        String line;
        while ((line = reader.readLine()) != null) {
            lines.add(line);
        }

        // Header line 1: "URL:<url>" (the first line is consumed either way)
        if (lines.size() > 0) {
            String urlLine = lines.remove(0);
            if (urlLine != null && urlLine.startsWith("URL:")) {
                urlLine = urlLine.substring(4);
                doc.add(new TextField("URL", urlLine, Field.Store.YES));
            }
        }
        // Header line 2: "DataType:<type>"
        if (lines.size() > 0) {
            String dataType = lines.remove(0);
            if (dataType != null && dataType.startsWith("DataType:")) {
                dataType = dataType.substring(9);
                doc.add(new TextField("DataType", dataType, Field.Store.YES));
            }
        }
        // Header line 3: "Title:<title>"
        if (lines.size() > 0) {
            String title = lines.remove(0);
            if (title != null && title.startsWith("Title:")) {
                title = title.substring(6);
                doc.add(new TextField("title", title, Field.Store.YES));
            }
        }

        // PERF FIX: accumulate with StringBuilder instead of repeated String
        // concatenation, which is O(n^2) in the total content length.
        StringBuilder content = new StringBuilder();
        for (String s : lines) {
            content.append(s);
        }
        doc.add(new TextField("contents", content.toString(), Field.Store.NO));

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:edu.albany.ir.example.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            // Recurse into the directory and index every entry in turn.
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                fnfe.printStackTrace();
                return;
            }

            try {

                // our code
                // *********************************************************
                // One physical file may hold several logical documents,
                // delimited by <DOCNO>...</DOCNO> marker lines (TREC-style
                // collections). The loop below is a small state machine:
                // each time a DOCNO marker is seen, the previously
                // accumulated Document (if any) is flushed to the writer
                // and a fresh Document is started for the new DOCNO.
                String record = null;
                // stringNum counts how many DOCNO markers have been seen so far.
                int a, b, stringNum = 0, i = 0;
                // String[] docContents = new String[1000];
                // String[] docNos = new String[10000];
                String docName = null;
                // make a new, empty document
                Document doc = new Document();

                // NOTE(review): reader uses the platform default charset;
                // presumably the collection is ASCII/UTF-8 — confirm.
                BufferedReader reader = new BufferedReader(new InputStreamReader(fis));

                record = new String();
                while ((record = reader.readLine()) != null) {
                    a = record.lastIndexOf("<DOCNO>");
                    b = record.indexOf("</DOCNO>");

                    if (a >= 0 && b > 0) // if this line contains the DOCNO
                    {
                        stringNum++;
                        // docNos[stringNum] = record.substring(a+7,b-1);
                        // a + 7 skips past the "<DOCNO>" tag itself.
                        docName = record.substring(a + 7, b).trim();
                        // add a document
                        if (stringNum >= 1) {
                            // index previous document
                            // (only from the second marker onward: the
                            // initial empty Document created above is
                            // replaced without ever being written)
                            if (stringNum >= 2)
                                writer.addDocument(doc);

                            // start new document
                            doc = new Document();
                            // doc.add(new Field("path", file.getPath()+
                            // "/"+docName,
                            // Add the path of the file as a field named
                            // "path". Use a
                            // field that is indexed (i.e. searchable), but
                            // don't tokenize
                            // the field into separate words and don't index
                            // term frequency
                            // or positional information:
                            // (Legacy Lucene 3.x API: Field.Index and
                            // setOmitTermFreqAndPositions were replaced by
                            // StringField/FieldType in Lucene 4+.)
                            Field pathField = new Field("path", docName, Field.Store.YES,
                                    Field.Index.NOT_ANALYZED_NO_NORMS);
                            pathField.setOmitTermFreqAndPositions(true);
                            doc.add(pathField);
                            // doc.add(new Field("path", docName,
                            // Field.Store.YES,
                            // Field.Index.UN_TOKENIZED));
                            // System.out.println("adding " +
                            // file.getPath()+ "/"+docName);
                            System.out.println("adding " + docName);

                            // Add the last modified date of the file a
                            // field named "modified".
                            // Use a NumericField that is indexed (i.e.
                            // efficiently filterable with
                            // NumericRangeFilter). This indexes to
                            // milli-second resolution, which
                            // is often too fine. You could instead create a
                            // number based on
                            // year/month/day/hour/minutes/seconds, down the
                            // resolution you require.
                            // For example the long value 2011021714 would
                            // mean
                            // February 17, 2011, 2-3 PM.
                            NumericField modifiedField = new NumericField("modified");
                            modifiedField.setLongValue(file.lastModified());
                            doc.add(modifiedField);

                            // doc.add(new Field("modified",
                            // DateField.timeToString(file.lastModified()),
                            // Field.Store.YES,
                            // Field.Index.UN_TOKENIZED));
                        }
                    } else {

                        // Non-marker line: append it to the current document
                        // as one more stored, analyzed "contents" field
                        // (one field instance per input line).
                        doc.add(new Field("contents", record, Field.Store.YES, Field.Index.ANALYZED, // tokenized
                                Field.TermVector.YES));
                        // docContents[stringNum] = docContents[stringNum] +
                        // record;
                        // add contents to document
                        // Add the contents of the file to a field named
                        // "contents". Specify a Reader,
                        // so that the text of the file is tokenized and
                        // indexed, but not stored.
                        // Note that FileReader expects the file to be in
                        // UTF-8 encoding.
                        // If that's not the case searching for special
                        // characters will fail.
                        // doc.add(new Field("contents", new
                        // BufferedReader(new InputStreamReader(fis,
                        // "UTF-8"))));
                    }
                    a = 0;
                    b = 0;
                }

                // Flush the final document: the loop above only writes a
                // document when the NEXT DOCNO marker appears, so the last
                // one in the file is still pending here.
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + docName);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    // NOTE(review): this matches on "path" == file.getPath(),
                    // but documents are stored with "path" set to the DOCNO
                    // value above — presumably the term never matches and the
                    // update degrades to a plain add; verify against the
                    // index's actual "path" values.
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:edu.cmu.cs.in.search.HoopLuceneIndex.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // Skip anything we are not permitted to read.
    if (!file.canRead()) {
        return;
    }

    if (file.isDirectory()) {
        // Recurse over every entry; list() returns null on an I/O error,
        // in which case the directory is silently skipped.
        String[] children = file.list();
        if (children != null) {
            for (String child : children) {
                indexDocs(writer, new File(file, child));
            }
        }
        return;
    }

    FileInputStream stream;
    try {
        stream = new FileInputStream(file);
    } catch (FileNotFoundException ignored) {
        // At least on Windows, some temporary files raise this exception
        // with an "access denied" message even though canRead() was true,
        // so the file is deliberately skipped rather than failing the run.
        return;
    }

    try {
        // Build one Lucene document per file.
        Document doc = new Document();

        // "path" is the unique key: indexed as a single untokenized term
        // (no term frequency / positional information) and stored for display.
        doc.add(new StringField("path", file.getPath(), Field.Store.YES));

        // "modified" is a numeric field, efficiently filterable with
        // NumericRangeFilter. Millisecond resolution; coarser encodings
        // (e.g. the long 2011021714 for Feb 17 2011, 2-3 PM) are possible
        // if that precision is not needed.
        doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

        // "contents" takes a Reader so the text is tokenized and indexed
        // but not stored. The file is assumed to be UTF-8; searches for
        // special characters will fail otherwise.
        doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, "UTF-8"))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // Fresh index: no prior copy of this document can exist.
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index: replace any old document with the same path.
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
        }
    } finally {
        stream.close();
    }
}

From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java

License:Apache License

/**
 * Indexes one exploded-XML article file, or recurses into a directory of them.
 * For every column of every table group it adds a "column" document with
 * numeric summary features, and for each numeric cell a "cell" document.
 *
 * @param writer the index writer; in CREATE mode documents are added,
 *               otherwise updated by the "path" term
 * @param file   file to index, or directory to recurse into
 * @throws IOException if the file cannot be opened or the writer fails
 */
public void indexExplodedXml(IndexWriter writer, File file) throws IOException {
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // list() returns null on an I/O error; skip silently in that case.
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexExplodedXml(writer, new File(file, files[i]));
                }
            }
        } else {
            // Opened only to fail fast if the file is unreadable; the XML
            // reader below re-opens the file by path. Closed in finally.
            FileInputStream fis = new FileInputStream(file);
            try {
                NumericFeatureGenerator nfg = new NumericFeatureGenerator();
                if (this.xmlreader == null) {
                    this.xmlreader = new XmlStAXReader();
                }
                Article a = xmlreader.readArticleFromXml(file.getAbsolutePath());
                for (Table t : a.tables) {
                    for (Group g : t.groups) {
                        for (Column col : g.columns) {
                            // --- index one document per column ---
                            Document coldoc = new Document();
                            // Feature vector indices: 0 int-ratio, 1 float-ratio,
                            // 3 mean, 4 std, 6 min, 7 max, 8 acc, 11 magnitude.
                            ArrayList<Double> cfv = nfg.getFeatureVector(col.content);
                            if (cfv.get(0) != null) {
                                DoubleField intratio = new DoubleField("intratio", cfv.get(0), Field.Store.NO);
                                coldoc.add(intratio);
                            }
                            if (cfv.get(1) != null) {
                                DoubleField floatratio = new DoubleField("floatratio", cfv.get(1),
                                        Field.Store.NO);
                                coldoc.add(floatratio);
                            }
                            if (cfv.get(3) != null) {
                                DoubleField mean = new DoubleField("mean", cfv.get(3), Field.Store.NO);
                                coldoc.add(mean);
                            }
                            if (cfv.get(4) != null) {
                                DoubleField std = new DoubleField("std", cfv.get(4), Field.Store.NO);
                                coldoc.add(std);
                            }
                            if (cfv.get(6) != null) {
                                DoubleField min = new DoubleField("min", cfv.get(6), Field.Store.NO);
                                coldoc.add(min);
                            }
                            if (cfv.get(7) != null) {
                                DoubleField max = new DoubleField("max", cfv.get(7), Field.Store.NO);
                                coldoc.add(max);
                            }
                            if (cfv.get(8) != null) {
                                DoubleField acc = new DoubleField("acc", cfv.get(8), Field.Store.NO);
                                coldoc.add(acc);
                            }
                            if (cfv.get(11) != null) {
                                DoubleField colmag = new DoubleField("colmag", cfv.get(11), Field.Store.NO);
                                coldoc.add(colmag);
                            }

                            // Stored copy of the whole group for display; a
                            // StringField value may not exceed the indexed-term
                            // byte limit, so oversized groups are truncated.
                            StringField wholegroup = new StringField("wholegroup", g.toString(),
                                    Field.Store.YES);
                            if (wholegroup.stringValue().getBytes().length > 32760) {
                                wholegroup.setStringValue("Table too large...");
                                System.err.println(
                                        "table too large:" + wholegroup.stringValue().getBytes().length);

                            }
                            // Concatenate all header texts into one searchable field.
                            String headers = "";
                            if (col.headers != null) {
                                for (Header hdr : col.headers) {
                                    headers += hdr.text.toLowerCase() + " ";
                                }
                            }
                            TextField header = new TextField("headerkeywords", headers.trim(), Field.Store.NO);
                            coldoc.add(header);
                            coldoc.add(wholegroup);
                            StringField fname = new StringField("filename", file.getAbsolutePath(),
                                    Field.Store.YES);
                            coldoc.add(fname);
                            StringField type = new StringField("type", "column", Field.Store.YES);
                            coldoc.add(type);
                            // NOTE(review): assumes col.content is non-empty;
                            // an empty column would throw here — confirm upstream.
                            IntField bstart = new IntField("bytestart", col.content.get(0).byteStart,
                                    Field.Store.YES);
                            IntField bend = new IntField("byteend",
                                    col.content.get(col.content.size() - 1).byteEnd, Field.Store.YES);
                            // Pipe-join the raw cell texts for stored display.
                            String content = "";
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content)
                                content += f.text + "|";
                            StringField colcontent = new StringField("colcontent",
                                    content.substring(0, content.length() - 1), Field.Store.YES);
                            coldoc.add(colcontent);
                            coldoc.add(bstart);
                            coldoc.add(bend);
                            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                writer.addDocument(coldoc);
                                totalDocAdded++;
                            } else {
                                writer.updateDocument(new Term("path", file.getPath()), coldoc);
                            }
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) {
                                // --- index one document per numeric cell ---
                                Document celldoc = new Document();
                                // Cell feature indices: 0 type tag (1=int, 2=float),
                                // 1 value, 2 error, 4 p-value, 5 magnitude.
                                ArrayList<Double> fv = nfg.field2Features(f);
                                if (fv.get(0) == 1 || fv.get(0) == 2) {
                                    try {
                                        DoubleField df = new DoubleField("value", fv.get(1), Field.Store.YES);
                                        celldoc.add(df);
                                        StringField textf = new StringField("text", f.text, Field.Store.YES);
                                        celldoc.add(textf);
                                        // BUGFIX: the original used non-short-circuit '&'
                                        // (NPE when the value is null) and 'x != Double.NaN',
                                        // which is always true by IEEE-754 — NaN compares
                                        // unequal to everything, including itself. Use
                                        // '&&' with Double.isNaN instead.
                                        if (fv.get(2) != null && !Double.isNaN(fv.get(2))) {
                                            DoubleField errf = new DoubleField("error", fv.get(2),
                                                    Field.Store.NO);
                                            celldoc.add(errf);
                                        }
                                        // BUGFIX: same always-true NaN comparison, plus a
                                        // missing null guard before unboxing.
                                        if (fv.get(5) != null && !Double.isNaN(fv.get(5))) {
                                            DoubleField magf = new DoubleField("cellmag", fv.get(5),
                                                    Field.Store.NO);
                                            celldoc.add(magf);
                                        }
                                        if (fv.get(4) != null) {
                                            DoubleField pvalue = new DoubleField("cellpvalue", fv.get(4),
                                                    Field.Store.NO);
                                            celldoc.add(pvalue);
                                        }
                                        StringField sf = new StringField("filename", file.getAbsolutePath(),
                                                Field.Store.YES);
                                        celldoc.add(sf);

                                        StringField ctype = new StringField("type", "cell", Field.Store.YES);
                                        celldoc.add(ctype);
                                        //StringField cwholegroup=new StringField("wholegroup", g.toString(), Field.Store.YES);
                                        //celldoc.add(cwholegroup);
                                        IntField cbstart = new IntField("bytestart", f.byteStart,
                                                Field.Store.YES);
                                        IntField cbend = new IntField("byteend", f.byteEnd, Field.Store.YES);
                                        celldoc.add(cbstart);
                                        celldoc.add(cbend);
                                    } catch (NullPointerException e) {
                                        // Best-effort: a malformed cell is logged and its
                                        // partially-built document is still submitted below.
                                        e.printStackTrace();
                                        System.out.println(f.text);
                                    }
                                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                        writer.addDocument(celldoc);
                                        totalDocAdded++;
                                    } else {
                                        writer.updateDocument(new Term("path", file.getPath()), celldoc);
                                    }
                                }
                            }
                        }
                    }
                }

            } finally {
                fis.close();
            }
        }
    }
}