Example usage for org.apache.lucene.index IndexWriter getConfig

List of usage examples for org.apache.lucene.index IndexWriter getConfig

Introduction

In this page you can find the example usage for org.apache.lucene.index IndexWriter getConfig.

Prototype

public LiveIndexWriterConfig getConfig() 

Source Link

Document

Returns a LiveIndexWriterConfig, which can be used to query the IndexWriter's current settings, as well as to modify "live" ones.

Usage

From source file:com.yangxu.searchengine.index.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory./* w w w  .j  a  v  a  2 s .c  o  m*/
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
private void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    // title
    String titleValue = null;
    // content
    String contentValue = null;
    String urlValue = null;
    String indextimeValue = null;
    String uploadtimeValue = null;
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
                LineNumberReader reader = new LineNumberReader(new InputStreamReader(fis, "UTF-8"));
                String line = null;
                StringBuilder sb = new StringBuilder();
                while ((line = reader.readLine()) != null) {
                    // int lineNumber = reader.getLineNumber();
                    switch (reader.getLineNumber()) {
                    case 1:
                        urlValue = line;
                        break;
                    case 2:
                        uploadtimeValue = line;
                        break;
                    case 3:
                        titleValue = line.split(":")[1];
                        break;
                    case 4:
                        break;
                    default:
                        sb.append(line);
                        break;

                    }
                    /*
                     * if (reader.getLineNumber() == 1) { urlValue = line; }
                     * 
                     * if (reader.getLineNumber() == 3) { titleValue =
                     * line.split(":")[1]; } else if (reader.getLineNumber()
                     * > 4) { sb.append(line); }
                     */
                }
                contentValue = sb.toString();
                reader.close();

            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't
                // tokenize
                // the field into separate words and don't index term
                // frequency
                // or positional information:

                Field urlField = new Field("url", urlValue, Field.Store.YES, Field.Index.NOT_ANALYZED);
                urlField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(urlField);

                Field titleField = new Field("title", titleValue, Field.Store.YES, Field.Index.ANALYZED);
                titleField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(titleField);

                Field contentField = new Field("content", contentValue, Field.Store.YES, Field.Index.ANALYZED);
                contentField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(contentField);
                // Add the last modified date of the file a field named
                // "modified".
                // Use a NumericField that is indexed (i.e. efficiently
                // filterable with
                // NumericRangeFilter). This indexes to milli-second
                // resolution, which
                // is often too fine. You could instead create a number
                // based on
                // year/month/day/hour/minutes/seconds, down the resolution
                // you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.               
                SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMdd HH:mm:ss");
                //Calendar cal = Calendar.getInstance(); // 
                //timeValue = formatter.format(cal.getTime());
                Date now = new Date();
                indextimeValue = formatter.format(now);

                Field indextimeField = new Field("indextime", indextimeValue, Field.Store.YES,
                        Field.Index.NOT_ANALYZED);
                titleField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(indextimeField);

                Field uploadtimeField = new Field("uploadtime", uploadtimeValue, Field.Store.YES,
                        Field.Index.NOT_ANALYZED);
                titleField.setIndexOptions(IndexOptions.DOCS_ONLY);
                doc.add(uploadtimeField);

                // Add the contents of the file to a field named "contents".
                // Specify a Reader,
                // so that the text of the file is tokenized and indexed,
                // but not stored.
                // Note that FileReader expects the file to be in UTF-8
                // encoding.
                // If that's not the case searching for special characters
                // will fail.
                // doc.add(new Field("contents", new BufferedReader(
                // new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("url", urlValue), doc);
                    writer.updateDocument(new Term("title", titleValue), doc);
                    writer.updateDocument(new Term("content", contentValue), doc);
                    writer.updateDocument(new Term("indextime", String.valueOf(indextimeValue)), doc);
                    writer.updateDocument(new Term("uploadtime", String.valueOf(uploadtimeValue)), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.zghw.lucene.demo.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * /* w w  w.ja v a  2s  .  c  o  m*/
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:com.zsq.lucene.chapter1.IndexFiles.java

License:Apache License

/** Indexes a single document. */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream in = Files.newInputStream(file)) {
        // fresh, empty document for this file
        Document doc = new Document();

        // "path": indexed as one untokenized term (searchable, no term
        // frequency/positions) and stored; used as the update key below
        doc.add(new StringField("path", file.toString(), Field.Store.YES));

        // "modified": numeric field usable with NumericRangeFilter;
        // millisecond resolution is often finer than needed — a coarser
        // value such as 2011021714 (Feb 17, 2011, 2-3 PM) also works
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // "contents": tokenized and indexed from a Reader, not stored;
        // the reader assumes UTF-8 input, otherwise searches for special
        // characters will fail
        doc.add(new TextField("contents",
                new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))));

        boolean creating = writer.getConfig().getOpenMode() == OpenMode.CREATE;
        if (creating) {
            // brand-new index: nothing to replace, a plain add is enough
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // existing index: replace any older document with this exact path
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:concurrency.IndexFiles.java

License:Apache License

/** Indexes a single document. */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        Document document = new Document();
        String pathText = file.toString();

        // "path" is searchable as a single untokenized term (no term
        // frequency or positional data) and stored, so it can serve as the
        // unique key for updates below
        document.add(new StringField("path", pathText, Field.Store.YES));

        // "modified" is a numeric field, efficiently filterable with
        // NumericRangeFilter; millisecond resolution — coarser encodings
        // such as 2011021714 (February 17, 2011, 2-3 PM) are also possible
        document.add(new LongField("modified", lastModified, Field.Store.NO));

        // "contents" is fed from a Reader: tokenized and indexed but not
        // stored; the file must be UTF-8 encoded or searches for special
        // characters will fail
        Reader contents = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        document.add(new TextField("contents", contents));

        if (writer.getConfig().getOpenMode() != OpenMode.CREATE) {
            // existing index: swap out any older document with this path
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", pathText), document);
        } else {
            // brand-new index: nothing to replace, plain add is enough
            System.out.println("adding " + file);
            writer.addDocument(document);
        }
    }
}

From source file:cs412.project.search.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur (list() returns null in that case)
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // "path": indexed (searchable) but not tokenized and no term
                // frequency/positions; stored so it can serve as the unique
                // update key below
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // "modified": numeric LongField, efficiently filterable with
                // NumericRangeFilter; millisecond resolution (often finer than
                // needed — e.g. 2011021714 would mean Feb 17, 2011, 2-3 PM)
                doc.add(new LongField("modified", file.lastModified(), Field.Store.YES));

                // "contents": tokenized and indexed from a Reader, not stored;
                // the file must be UTF-8 encoded or searches for special
                // characters will fail
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    //            System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index: replace the old copy keyed by its exact path.
                    // BUGFIX: a second call
                    //   writer.updateDocument(new Term("modified", Long.toString(...)), doc)
                    // was removed — a LongField is indexed numerically, so a plain
                    // string Term can never match it; the call therefore deleted
                    // nothing and re-added the document, duplicating every updated
                    // file in the index.
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:cs571.proj1.IndexFiles.java

License:Apache License

/**
 * Indexes a single input file that may contain multiple logical documents,
 * delimited by the {@code docStart}/{@code docEnd} marker lines (presumably
 * TREC-style &lt;DOC&gt; ... &lt;/DOC&gt; blocks — confirm against the
 * constants' definitions). Each document's ID is taken from the first line
 * starting with {@code docNO_start} (preferred) or {@code docID_start};
 * blocks with neither are silently skipped. Content lines between the
 * markers are accumulated and indexed under "contents"; when {@code tfidf}
 * or {@code bm25} is set, term vectors with positions and offsets are
 * stored as well. Increments the class-level {@code numOfDocuments} counter
 * per indexed document.
 */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    if (checkForIgnoredFile(file.getFileName().toString()))
        return; // file matches the ignore list — skip it entirely
    try (InputStream stream = Files.newInputStream(file)) {
        // Read line by line; the file is expected to be UTF-8 encoded.
        BufferedReader br = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        String line, docNO = null, docID = null, s;
        StringBuilder sb = new StringBuilder();   // content of the current document block
        Field docIDField = null;
        Document doc = new Document();

        boolean docFound = false;  // true while between docStart and docEnd markers
        while ((line = br.readLine()) != null) {
            line = line.trim();
            // Capture the document number / id lines; note these are matched
            // anywhere, not only inside a docStart..docEnd block.
            if (line.startsWith(docNO_start)) {
                docNO = removeTags(line);
                continue;
            }
            if (line.startsWith(docID_start)) {
                docID = removeTags(line);
                continue;
            }

            if (line.equals(docStart)) {
                docFound = true;
                continue;
            }

            if (line.equals(docEnd)) {
                docFound = false;

                // Prefer DOCNO over DOCID; a block with neither is dropped.
                // NOTE(review): docNO/docID are never reset between blocks,
                // so a later block without its own tag inherits the previous
                // one's id — confirm this is intended.
                if (docNO != null) {
                    docIDField = new StringField("docID", docNO, Field.Store.YES);
                    doc.add(docIDField);
                } else if (docID != null) {
                    docIDField = new StringField("docID", docID, Field.Store.YES);
                    doc.add(docIDField);
                } else {
                    continue;
                }

                if (tfidf || bm25) {
                    // Scoring models that need term statistics: store term
                    // vectors with positions and offsets for "contents".
                    FieldType tv = new FieldType();
                    tv.setTokenized(true);
                    tv.setStoreTermVectors(true);
                    tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
                    s = sb.toString();
                    // Round-trip through a stream so the field is fed from a
                    // Reader (tokenized/indexed, not stored).
                    // NOTE(review): getBytes()/InputStreamReader here use the
                    // platform default charset — confirm that is intended.
                    byte[] bytes = s.getBytes();
                    ByteArrayInputStream bstream = new ByteArrayInputStream(bytes);
                    InputStreamReader isr = new InputStreamReader(bstream);
                    doc.add(new Field("contents", isr, tv));
                } else {
                    // Plain tokenized contents, not stored.
                    doc.add(new TextField("contents", sb.toString(), Field.Store.NO));
                }

                // "path" keys updates; "modified" supports range filtering.
                Field pathField = new StringField("path", file.toString(), Field.Store.YES);
                doc.add(pathField);
                doc.add(new LongField("modified", lastModified, Field.Store.NO));
                numOfDocuments++;
                sb.setLength(0);  // reset the buffer for the next block
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.toString()), doc);
                }
                doc = new Document();  // start a fresh document for the next block
                continue;
            }

            // Ordinary content line inside a document block.
            if (docFound) {
                sb.append(line).append("\n");
            }

        }

    }
}

From source file:de.hsmannheim.ss15.alr.searchengine.DefaultLuceneController.java

/**
 * Indexes a single document.
 *
 * Input layout: up to three optional header lines at the top of the file —
 * "URL:...", "DataType:...", "Title:..." — each indexed as its own stored
 * field when present; all remaining lines are concatenated and indexed
 * (not stored) under "contents".
 *
 * @param writer       index writer receiving the document
 * @param file         path of the file to index (also used as the update key)
 * @param lastModified last-modified timestamp stored in the "modified" field
 * @throws IOException if the file cannot be read
 */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();

        // "path": indexed as a single untokenized term and stored; serves
        // as the unique key for updates below
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // "modified": numeric field, efficiently filterable with
        // NumericRangeFilter; millisecond resolution
        doc.add(new LongField("modified", lastModified, Field.Store.NO));

        // The file is expected to be UTF-8 encoded.
        BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));

        // BUGFIX: the previous loop was `while (reader.ready()) ...`.
        // Reader.ready() only reports whether a read can complete without
        // blocking — it may return false before end-of-file, silently
        // truncating the input. Read until readLine() returns null instead.
        List<String> lines = new ArrayList<>();
        String line;
        while ((line = reader.readLine()) != null) {
            lines.add(line);
        }

        // Header line 1: "URL:<url>" (the first line is consumed either way)
        if (lines.size() > 0) {
            String urlLine = lines.remove(0);
            if (urlLine != null && urlLine.startsWith("URL:")) {
                urlLine = urlLine.substring(4);
                doc.add(new TextField("URL", urlLine, Field.Store.YES));
            }
        }
        // Header line 2: "DataType:<type>"
        if (lines.size() > 0) {
            String dataType = lines.remove(0);
            if (dataType != null && dataType.startsWith("DataType:")) {
                dataType = dataType.substring(9);
                doc.add(new TextField("DataType", dataType, Field.Store.YES));
            }
        }
        // Header line 3: "Title:<title>"
        if (lines.size() > 0) {
            String title = lines.remove(0);
            if (title != null && title.startsWith("Title:")) {
                title = title.substring(6);
                doc.add(new TextField("title", title, Field.Store.YES));
            }
        }

        // PERF FIX: accumulate with StringBuilder instead of repeated String
        // concatenation, which is O(n^2) in the total content length.
        StringBuilder content = new StringBuilder();
        for (String s : lines) {
            content.append(s);
        }
        doc.add(new TextField("contents", content.toString(), Field.Store.NO));

        if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:edu.albany.ir.example.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            // Recurse into the directory and index every entry in turn.
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                fnfe.printStackTrace();
                return;
            }

            try {

                // our code
                // *********************************************************
                // One physical file may hold several logical documents,
                // delimited by <DOCNO>...</DOCNO> marker lines (TREC-style
                // collections). The loop below is a small state machine:
                // each time a DOCNO marker is seen, the previously
                // accumulated Document (if any) is flushed to the writer
                // and a fresh Document is started for the new DOCNO.
                String record = null;
                // stringNum counts how many DOCNO markers have been seen so far.
                int a, b, stringNum = 0, i = 0;
                // String[] docContents = new String[1000];
                // String[] docNos = new String[10000];
                String docName = null;
                // make a new, empty document
                Document doc = new Document();

                // NOTE(review): reader uses the platform default charset;
                // presumably the collection is ASCII/UTF-8 — confirm.
                BufferedReader reader = new BufferedReader(new InputStreamReader(fis));

                record = new String();
                while ((record = reader.readLine()) != null) {
                    a = record.lastIndexOf("<DOCNO>");
                    b = record.indexOf("</DOCNO>");

                    if (a >= 0 && b > 0) // if this line contains the DOCNO
                    {
                        stringNum++;
                        // docNos[stringNum] = record.substring(a+7,b-1);
                        // a + 7 skips past the "<DOCNO>" tag itself.
                        docName = record.substring(a + 7, b).trim();
                        // add a document
                        if (stringNum >= 1) {
                            // index previous document
                            // (only from the second marker onward: the
                            // initial empty Document created above is
                            // replaced without ever being written)
                            if (stringNum >= 2)
                                writer.addDocument(doc);

                            // start new document
                            doc = new Document();
                            // doc.add(new Field("path", file.getPath()+
                            // "/"+docName,
                            // Add the path of the file as a field named
                            // "path". Use a
                            // field that is indexed (i.e. searchable), but
                            // don't tokenize
                            // the field into separate words and don't index
                            // term frequency
                            // or positional information:
                            // (Legacy Lucene 3.x API: Field.Index and
                            // setOmitTermFreqAndPositions were replaced by
                            // StringField/FieldType in Lucene 4+.)
                            Field pathField = new Field("path", docName, Field.Store.YES,
                                    Field.Index.NOT_ANALYZED_NO_NORMS);
                            pathField.setOmitTermFreqAndPositions(true);
                            doc.add(pathField);
                            // doc.add(new Field("path", docName,
                            // Field.Store.YES,
                            // Field.Index.UN_TOKENIZED));
                            // System.out.println("adding " +
                            // file.getPath()+ "/"+docName);
                            System.out.println("adding " + docName);

                            // Add the last modified date of the file a
                            // field named "modified".
                            // Use a NumericField that is indexed (i.e.
                            // efficiently filterable with
                            // NumericRangeFilter). This indexes to
                            // milli-second resolution, which
                            // is often too fine. You could instead create a
                            // number based on
                            // year/month/day/hour/minutes/seconds, down the
                            // resolution you require.
                            // For example the long value 2011021714 would
                            // mean
                            // February 17, 2011, 2-3 PM.
                            NumericField modifiedField = new NumericField("modified");
                            modifiedField.setLongValue(file.lastModified());
                            doc.add(modifiedField);

                            // doc.add(new Field("modified",
                            // DateField.timeToString(file.lastModified()),
                            // Field.Store.YES,
                            // Field.Index.UN_TOKENIZED));
                        }
                    } else {

                        // Non-marker line: append it to the current document
                        // as one more stored, analyzed "contents" field
                        // (one field instance per input line).
                        doc.add(new Field("contents", record, Field.Store.YES, Field.Index.ANALYZED, // tokenized
                                Field.TermVector.YES));
                        // docContents[stringNum] = docContents[stringNum] +
                        // record;
                        // add contents to document
                        // Add the contents of the file to a field named
                        // "contents". Specify a Reader,
                        // so that the text of the file is tokenized and
                        // indexed, but not stored.
                        // Note that FileReader expects the file to be in
                        // UTF-8 encoding.
                        // If that's not the case searching for special
                        // characters will fail.
                        // doc.add(new Field("contents", new
                        // BufferedReader(new InputStreamReader(fis,
                        // "UTF-8"))));
                    }
                    a = 0;
                    b = 0;
                }

                // Flush the final document: the loop above only writes a
                // document when the NEXT DOCNO marker appears, so the last
                // one in the file is still pending here.
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + docName);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    // NOTE(review): this matches on "path" == file.getPath(),
                    // but documents are stored with "path" set to the DOCNO
                    // value above — presumably the term never matches and the
                    // update degrades to a plain add; verify against the
                    // index's actual "path" values.
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:edu.cmu.cs.in.search.HoopLuceneIndex.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // Skip anything we are not permitted to read.
    if (!file.canRead()) {
        return;
    }

    if (file.isDirectory()) {
        // Recurse over every entry; list() returns null on an I/O error,
        // in which case the directory is silently skipped.
        String[] children = file.list();
        if (children != null) {
            for (String child : children) {
                indexDocs(writer, new File(file, child));
            }
        }
        return;
    }

    FileInputStream stream;
    try {
        stream = new FileInputStream(file);
    } catch (FileNotFoundException ignored) {
        // At least on Windows, some temporary files raise this exception
        // with an "access denied" message even though canRead() was true,
        // so the file is deliberately skipped rather than failing the run.
        return;
    }

    try {
        // Build one Lucene document per file.
        Document doc = new Document();

        // "path" is the unique key: indexed as a single untokenized term
        // (no term frequency / positional information) and stored for display.
        doc.add(new StringField("path", file.getPath(), Field.Store.YES));

        // "modified" is a numeric field, efficiently filterable with
        // NumericRangeFilter. Millisecond resolution; coarser encodings
        // (e.g. the long 2011021714 for Feb 17 2011, 2-3 PM) are possible
        // if that precision is not needed.
        doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

        // "contents" takes a Reader so the text is tokenized and indexed
        // but not stored. The file is assumed to be UTF-8; searches for
        // special characters will fail otherwise.
        doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, "UTF-8"))));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // Fresh index: no prior copy of this document can exist.
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index: replace any old document with the same path.
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.getPath()), doc);
        }
    } finally {
        stream.close();
    }
}

From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java

License:Apache License

/**
 * Indexes one exploded-XML article file, or recurses into a directory of them.
 * For every column of every table group it adds a "column" document with
 * numeric summary features, and for each numeric cell a "cell" document.
 *
 * @param writer the index writer; in CREATE mode documents are added,
 *               otherwise updated by the "path" term
 * @param file   file to index, or directory to recurse into
 * @throws IOException if the file cannot be opened or the writer fails
 */
public void indexExplodedXml(IndexWriter writer, File file) throws IOException {
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // list() returns null on an I/O error; skip silently in that case.
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexExplodedXml(writer, new File(file, files[i]));
                }
            }
        } else {
            // Opened only to fail fast if the file is unreadable; the XML
            // reader below re-opens the file by path. Closed in finally.
            FileInputStream fis = new FileInputStream(file);
            try {
                NumericFeatureGenerator nfg = new NumericFeatureGenerator();
                if (this.xmlreader == null) {
                    this.xmlreader = new XmlStAXReader();
                }
                Article a = xmlreader.readArticleFromXml(file.getAbsolutePath());
                for (Table t : a.tables) {
                    for (Group g : t.groups) {
                        for (Column col : g.columns) {
                            // --- index one document per column ---
                            Document coldoc = new Document();
                            // Feature vector indices: 0 int-ratio, 1 float-ratio,
                            // 3 mean, 4 std, 6 min, 7 max, 8 acc, 11 magnitude.
                            ArrayList<Double> cfv = nfg.getFeatureVector(col.content);
                            if (cfv.get(0) != null) {
                                DoubleField intratio = new DoubleField("intratio", cfv.get(0), Field.Store.NO);
                                coldoc.add(intratio);
                            }
                            if (cfv.get(1) != null) {
                                DoubleField floatratio = new DoubleField("floatratio", cfv.get(1),
                                        Field.Store.NO);
                                coldoc.add(floatratio);
                            }
                            if (cfv.get(3) != null) {
                                DoubleField mean = new DoubleField("mean", cfv.get(3), Field.Store.NO);
                                coldoc.add(mean);
                            }
                            if (cfv.get(4) != null) {
                                DoubleField std = new DoubleField("std", cfv.get(4), Field.Store.NO);
                                coldoc.add(std);
                            }
                            if (cfv.get(6) != null) {
                                DoubleField min = new DoubleField("min", cfv.get(6), Field.Store.NO);
                                coldoc.add(min);
                            }
                            if (cfv.get(7) != null) {
                                DoubleField max = new DoubleField("max", cfv.get(7), Field.Store.NO);
                                coldoc.add(max);
                            }
                            if (cfv.get(8) != null) {
                                DoubleField acc = new DoubleField("acc", cfv.get(8), Field.Store.NO);
                                coldoc.add(acc);
                            }
                            if (cfv.get(11) != null) {
                                DoubleField colmag = new DoubleField("colmag", cfv.get(11), Field.Store.NO);
                                coldoc.add(colmag);
                            }

                            // Stored copy of the whole group for display; a
                            // StringField value may not exceed the indexed-term
                            // byte limit, so oversized groups are truncated.
                            StringField wholegroup = new StringField("wholegroup", g.toString(),
                                    Field.Store.YES);
                            if (wholegroup.stringValue().getBytes().length > 32760) {
                                wholegroup.setStringValue("Table too large...");
                                System.err.println(
                                        "table too large:" + wholegroup.stringValue().getBytes().length);

                            }
                            // Concatenate all header texts into one searchable field.
                            String headers = "";
                            if (col.headers != null) {
                                for (Header hdr : col.headers) {
                                    headers += hdr.text.toLowerCase() + " ";
                                }
                            }
                            TextField header = new TextField("headerkeywords", headers.trim(), Field.Store.NO);
                            coldoc.add(header);
                            coldoc.add(wholegroup);
                            StringField fname = new StringField("filename", file.getAbsolutePath(),
                                    Field.Store.YES);
                            coldoc.add(fname);
                            StringField type = new StringField("type", "column", Field.Store.YES);
                            coldoc.add(type);
                            // NOTE(review): assumes col.content is non-empty;
                            // an empty column would throw here — confirm upstream.
                            IntField bstart = new IntField("bytestart", col.content.get(0).byteStart,
                                    Field.Store.YES);
                            IntField bend = new IntField("byteend",
                                    col.content.get(col.content.size() - 1).byteEnd, Field.Store.YES);
                            // Pipe-join the raw cell texts for stored display.
                            String content = "";
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content)
                                content += f.text + "|";
                            StringField colcontent = new StringField("colcontent",
                                    content.substring(0, content.length() - 1), Field.Store.YES);
                            coldoc.add(colcontent);
                            coldoc.add(bstart);
                            coldoc.add(bend);
                            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                writer.addDocument(coldoc);
                                totalDocAdded++;
                            } else {
                                writer.updateDocument(new Term("path", file.getPath()), coldoc);
                            }
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) {
                                // --- index one document per numeric cell ---
                                Document celldoc = new Document();
                                // Cell feature indices: 0 type tag (1=int, 2=float),
                                // 1 value, 2 error, 4 p-value, 5 magnitude.
                                ArrayList<Double> fv = nfg.field2Features(f);
                                if (fv.get(0) == 1 || fv.get(0) == 2) {
                                    try {
                                        DoubleField df = new DoubleField("value", fv.get(1), Field.Store.YES);
                                        celldoc.add(df);
                                        StringField textf = new StringField("text", f.text, Field.Store.YES);
                                        celldoc.add(textf);
                                        // BUGFIX: the original used non-short-circuit '&'
                                        // (NPE when the value is null) and 'x != Double.NaN',
                                        // which is always true by IEEE-754 — NaN compares
                                        // unequal to everything, including itself. Use
                                        // '&&' with Double.isNaN instead.
                                        if (fv.get(2) != null && !Double.isNaN(fv.get(2))) {
                                            DoubleField errf = new DoubleField("error", fv.get(2),
                                                    Field.Store.NO);
                                            celldoc.add(errf);
                                        }
                                        // BUGFIX: same always-true NaN comparison, plus a
                                        // missing null guard before unboxing.
                                        if (fv.get(5) != null && !Double.isNaN(fv.get(5))) {
                                            DoubleField magf = new DoubleField("cellmag", fv.get(5),
                                                    Field.Store.NO);
                                            celldoc.add(magf);
                                        }
                                        if (fv.get(4) != null) {
                                            DoubleField pvalue = new DoubleField("cellpvalue", fv.get(4),
                                                    Field.Store.NO);
                                            celldoc.add(pvalue);
                                        }
                                        StringField sf = new StringField("filename", file.getAbsolutePath(),
                                                Field.Store.YES);
                                        celldoc.add(sf);

                                        StringField ctype = new StringField("type", "cell", Field.Store.YES);
                                        celldoc.add(ctype);
                                        //StringField cwholegroup=new StringField("wholegroup", g.toString(), Field.Store.YES);
                                        //celldoc.add(cwholegroup);
                                        IntField cbstart = new IntField("bytestart", f.byteStart,
                                                Field.Store.YES);
                                        IntField cbend = new IntField("byteend", f.byteEnd, Field.Store.YES);
                                        celldoc.add(cbstart);
                                        celldoc.add(cbend);
                                    } catch (NullPointerException e) {
                                        // Best-effort: a malformed cell is logged and its
                                        // partially-built document is still submitted below.
                                        e.printStackTrace();
                                        System.out.println(f.text);
                                    }
                                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                        writer.addDocument(celldoc);
                                        totalDocAdded++;
                                    } else {
                                        writer.updateDocument(new Term("path", file.getPath()), celldoc);
                                    }
                                }
                            }
                        }
                    }
                }

            } finally {
                fis.close();
            }
        }
    }
}