List of usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
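Before the collected examples, a minimal self-contained sketch of the call may help. It is not taken from any of the projects below; the index path, field names, and values are illustrative. It targets a recent Lucene release (7.x or later, matching the long sequence-number return type in the signature above), whereas most snippets on this page use older 3.x/4.x APIs whose constructors differ.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(Paths.get("example-index")); // illustrative path
        IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, conf)) {
            Document doc = new Document();
            doc.add(new StringField("id", "1", Field.Store.YES));           // indexed verbatim, stored
            doc.add(new TextField("body", "hello lucene", Field.Store.NO)); // analyzed full text
            long seqNo = writer.addDocument(doc); // sequence number ordering this operation
            System.out.println("added document, sequence number " + seqNo);
        } // close() also commits pending changes by default
    }
}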
From source file:driver651.Driver651.java
License:Apache License
public static void main(String[] args) throws Exception { // @FieldCacheImpl.java
    int threadNo = Integer.parseInt(args[0]);
    // 138 vs 179 507 (original) all-threads-one-cache
    // 1295 1779 one-thread-one-cache
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
    int theInt = Integer.MAX_VALUE;
    for (int j = 0; j < NUM_FIELDS; j++) {
        for (int i = 0; i < NUM_DOCS; i++) {
            Document doc = new Document();
            doc.add(new Field("theField" + j, String.valueOf(theInt--),
                    Field.Store.NO, Field.Index.UN_TOKENIZED)); // notice the field "theFieldj"
            writer.addDocument(doc);
        }
    }
    writer.close();
    reader = IndexReader.open(directory);
    // move it out of the loop, then you get the all-threads-one-cache scenario!
    FieldCacheImpl cache = new FieldCacheImpl();
    WorkerThread[] workers = new WorkerThread[threadNo];
    for (int i = 0; i < threadNo; i++) {
        workers[i] = new WorkerThread(cache);
    }
    long start = System.currentTimeMillis();
    for (int i = 0; i < threadNo; i++) {
        workers[i].start();
    }
    for (int i = 0; i < threadNo; i++) {
        workers[i].join();
    }
    long end = System.currentTimeMillis();
    System.out.println("duration: " + (end - start));
}
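This snippet predates Lucene 4.0: the (Directory, Analyzer, boolean) IndexWriter constructor and Field.Index.UN_TOKENIZED were removed long ago. Under the 4.x+ field API used by most later examples on this page, the not-analyzed, unstored field line would roughly become the following (a sketch, assuming the same intent):

doc.add(new StringField("theField" + j, String.valueOf(theInt--), Field.Store.NO));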
From source file:dynamicrefactoring.interfaz.wizard.search.internal.SearchableTypeIndexer.java
License:Open Source License
/**
 * Generates an index in the given directory.
 *
 * @param elementType
 *            searchable type whose elements are to be indexed
 * @param directory
 *            directory in which the index will be generated
 *
 * @return number of elements indexed
 * @throws CorruptIndexException
 * @throws IOException
 */
@Override
public int index(SearchableType elementType, Directory directory) throws IOException {
    final IndexWriter writer = createWriter(directory);
    int numIndexed = 0;
    JavadocReader javadocReader = EclipseBasedJavadocReader.INSTANCE;
    for (String fullyQualifiedName : elementType.getClassesToIndex()) {
        String text = javadocReader.getTypeJavaDocAsPlainText(fullyQualifiedName);
        Document doc = getDocument(fullyQualifiedName, text);
        writer.addDocument(doc);
        numIndexed++;
    }
    close(writer);
    return numIndexed;
}
From source file:edu.albany.ir.example.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message;
                // checking if the file can be read doesn't help
                fnfe.printStackTrace();
                return;
            }
            try {
                String record;
                int a, b, stringNum = 0;
                String docName = null;

                // make a new, empty document
                Document doc = new Document();
                BufferedReader reader = new BufferedReader(new InputStreamReader(fis));
                while ((record = reader.readLine()) != null) {
                    a = record.lastIndexOf("<DOCNO>");
                    b = record.indexOf("</DOCNO>");
                    if (a >= 0 && b > 0) { // this line contains the DOCNO
                        stringNum++;
                        docName = record.substring(a + 7, b).trim();
                        // index the previous document before starting a new one
                        if (stringNum >= 2)
                            writer.addDocument(doc);
                        // start new document
                        doc = new Document();
                        // Add the document name as a field named "path". Use a
                        // field that is indexed (i.e. searchable), but don't
                        // tokenize the field into separate words and don't index
                        // term frequency or positional information:
                        Field pathField = new Field("path", docName, Field.Store.YES,
                                Field.Index.NOT_ANALYZED_NO_NORMS);
                        pathField.setOmitTermFreqAndPositions(true);
                        doc.add(pathField);
                        System.out.println("adding " + docName);
                        // Add the last modified date of the file in a field named
                        // "modified". Use a NumericField that is indexed (i.e.
                        // efficiently filterable with NumericRangeFilter). This
                        // indexes to millisecond resolution, which is often too
                        // fine. You could instead create a number based on
                        // year/month/day/hour/minutes/seconds, down to the
                        // resolution you require. For example the long value
                        // 2011021714 would mean February 17, 2011, 2-3 PM.
                        NumericField modifiedField = new NumericField("modified");
                        modifiedField.setLongValue(file.lastModified());
                        doc.add(modifiedField);
                    } else {
                        // Add the line to a field named "contents": tokenized,
                        // stored, with term vectors.
                        doc.add(new Field("contents", record, Field.Store.YES,
                                Field.Index.ANALYZED, Field.TermVector.YES));
                    }
                    a = 0;
                    b = 0;
                }
                // add or update the last document read from this file
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    System.out.println("adding " + docName);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the
                    // old one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
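The add-or-update branch that ends this method is a common Lucene idiom worth isolating: on a freshly created index a plain addDocument suffices, while on an existing index updateDocument atomically deletes any document whose "path" term matches before adding the new one, so re-indexing never duplicates. A stripped-down sketch (same 3.x-era API; key is a stand-in and must equal the value actually stored in the document's "path" field for the replacement to take effect):

if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
    writer.addDocument(doc);                           // fresh index: nothing to replace
} else {
    writer.updateDocument(new Term("path", key), doc); // delete any doc whose "path" equals key, then add
}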
From source file:edu.cmu.cs.in.search.HoopLuceneIndex.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception
                // with an "access denied" message; checking if the file can be
                // read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file in a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you
                // require. For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify
                // a Reader, so that the text of the file is tokenized and indexed,
                // but not stored. Note that FileReader expects the file to be in
                // UTF-8 encoding. If that's not the case searching for special
                // characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the old
                    // one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazInfoIndexerAllCountries.java
License:Apache License
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException {
    Document d = new Document();
    StringField nfid = new StringField("ID", "0", Field.Store.YES);
    StringField name = new StringField("ORIGINAL-NAME", "", Field.Store.YES);
    IntField nfaltnames = new IntField("ALTNAME-COUNT", 0, Field.Store.YES);
    DoubleField nflong = new DoubleField("LONGTITUDE", 0.0, Field.Store.YES);
    DoubleField nfla = new DoubleField("LATITUDE", 0.0, Field.Store.YES);
    LongField nfpop = new LongField("POPULATION", 0, Field.Store.YES);
    StringField sfcountry = new StringField("COUNTRY-CODE", "", Field.Store.YES);
    StringField sfadm1 = new StringField("ADM1-CODE", "", Field.Store.YES);
    StringField sfadm2 = new StringField("ADM2-CODE", "", Field.Store.YES);
    StringField sfadm3 = new StringField("ADM3-CODE", "", Field.Store.YES);
    StringField sfadm4 = new StringField("ADM4-CODE", "", Field.Store.YES);
    StringField sffeatureclass = new StringField("FEATURE-CLASS", "", Field.Store.YES);
    StringField sffeature = new StringField("FEATURE", "", Field.Store.YES);
    StringField sftimezone = new StringField("TIMEZONE", "", Field.Store.YES);
    d.add(nfid);
    d.add(name);
    d.add(nfaltnames);
    d.add(nflong);
    d.add(nfla);
    d.add(nfpop);
    d.add(sfcountry);
    d.add(sfadm1);
    d.add(sfadm2);
    d.add(sfadm3);
    d.add(sfadm4);
    d.add(sffeatureclass);
    d.add(sffeature);
    d.add(sftimezone);

    String line;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        String[] column = line.trim().split("\t");
        // get other columns except for the location words
        String id = column[0];
        String utfname = column[1];
        String altnames = column[3];
        String latitude = column[4];
        String longtitude = column[5];
        double dlong, dla;
        if (latitude == null) {
            dlong = 999;
            dla = 999;
        } else {
            dlong = Double.parseDouble(longtitude);
            dla = Double.parseDouble(latitude);
        }
        String featureclass = column[6];
        String feature = column[7];
        String country = column[8];
        String population = column[14];
        long longpop;
        if (population == null)
            longpop = -1;
        else
            longpop = Long.parseLong(population); // parse only when a value is present
        String timezone = column[17];
        // set values to document d, and index it
        nfid.setStringValue(id);
        name.setStringValue(utfname);
        nfaltnames.setIntValue(altnames.split(",").length);
        nflong.setDoubleValue(dlong);
        nfla.setDoubleValue(dla);
        nfpop.setLongValue(longpop);
        sfcountry.setStringValue(country.toLowerCase());
        sfadm1.setStringValue(column[10].toLowerCase());
        sfadm2.setStringValue(column[11].toLowerCase());
        sfadm3.setStringValue(column[12].toLowerCase());
        sfadm4.setStringValue(column[13].toLowerCase());
        sffeatureclass.setStringValue(featureclass);
        sffeature.setStringValue(feature);
        sftimezone.setStringValue(timezone);
        // add this new document.
        iw.addDocument(d);
    }
}
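Note the reuse pattern in this method: the Document and all of its Field instances are allocated once, mutated with set*Value on every record, and passed repeatedly to addDocument, which copies the current values into the index; this avoids per-record allocations when bulk-loading. A stripped-down sketch of the same idiom (field name and input source assumed):

Document doc = new Document();
StringField id = new StringField("ID", "", Field.Store.YES);
doc.add(id);
String line;
while ((line = br.readLine()) != null) {
    id.setStringValue(line.trim()); // mutate in place; no new Document/Field per record
    iw.addDocument(doc);            // the field's value at call time is what gets indexed
}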
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAllCountries.java
License:Apache License
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {
    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);
    d.add(lang); // attach LANG to the document so the values set below are indexed

    String line;
    String[] column;
    String id, phrase, otherlang, oneOtherlang;
    int otherlangLength;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        column = line.trim().split("\t");
        // get other columns except for the location words
        id = column[0];
        phrase = column[1];
        otherlang = column[3];
        // set values to document d, and index it
        nfid.setStringValue(id);
        // id does not change; change the strings to index.
        phrase = phrase.toLowerCase();
        sforigin.setStringValue(phrase);
        normnws.setStringValue(phrase.replaceAll(" ", ""));
        getIndexFeatures(phrase);
        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());
        lang.setStringValue("");
        // add this new document.
        iw.addDocument(d);
        if (otherlang.length() == 0)
            continue;
        String otherlangs[] = otherlang.split(",");
        otherlangLength = otherlangs.length;
        for (int i = 0; i < otherlangLength; i++) {
            if (otherlangs[i].length() == 0)
                continue;
            // id does not change; change the strings to index.
            oneOtherlang = otherlangs[i].toLowerCase();
            sforigin.setStringValue(oneOtherlang);
            normnws.setStringValue(oneOtherlang.replaceAll(" ", ""));
            getIndexFeatures(oneOtherlang);
            sfunigram.setStringValue(getUnigram());
            sfbigram.setStringValue(getBigram());
            sftrigram.setStringValue(getTrigram());
            lang.setStringValue("");
            // add this new document.
            iw.addDocument(d);
        }
    }
}
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAltNames.java
License:Apache License
void indexAlterNames(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {
    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);
    d.add(lang); // attach LANG to the document so the language/link value is indexed

    String line;
    String[] column;
    String id, langOrLink, phrase;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        column = line.trim().split("\t");
        // get other columns except for the location words
        id = column[1];
        langOrLink = column[2];
        phrase = column[3];
        // set values to document d, and index it
        nfid.setStringValue(id);
        phrase = phrase.toLowerCase();
        sforigin.setStringValue(phrase);
        normnws.setStringValue(phrase.replaceAll(" ", ""));
        getIndexFeatures(phrase);
        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());
        lang.setStringValue(langOrLink);
        // add this new document.
        iw.addDocument(d);
    }
}
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexExplodedXml(IndexWriter writer, File file) throws IOException {
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexExplodedXml(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis = new FileInputStream(file);
            try {
                NumericFeatureGenerator nfg = new NumericFeatureGenerator();
                if (this.xmlreader == null) {
                    this.xmlreader = new XmlStAXReader();
                }
                Article a = xmlreader.readArticleFromXml(file.getAbsolutePath());
                for (Table t : a.tables) {
                    for (Group g : t.groups) {
                        for (Column col : g.columns) {
                            // index one document per column
                            Document coldoc = new Document();
                            ArrayList<Double> cfv = nfg.getFeatureVector(col.content);
                            if (cfv.get(0) != null)
                                coldoc.add(new DoubleField("intratio", cfv.get(0), Field.Store.NO));
                            if (cfv.get(1) != null)
                                coldoc.add(new DoubleField("floatratio", cfv.get(1), Field.Store.NO));
                            if (cfv.get(3) != null)
                                coldoc.add(new DoubleField("mean", cfv.get(3), Field.Store.NO));
                            if (cfv.get(4) != null)
                                coldoc.add(new DoubleField("std", cfv.get(4), Field.Store.NO));
                            if (cfv.get(6) != null)
                                coldoc.add(new DoubleField("min", cfv.get(6), Field.Store.NO));
                            if (cfv.get(7) != null)
                                coldoc.add(new DoubleField("max", cfv.get(7), Field.Store.NO));
                            if (cfv.get(8) != null)
                                coldoc.add(new DoubleField("acc", cfv.get(8), Field.Store.NO));
                            if (cfv.get(11) != null)
                                coldoc.add(new DoubleField("colmag", cfv.get(11), Field.Store.NO));
                            StringField wholegroup = new StringField("wholegroup", g.toString(), Field.Store.YES);
                            if (wholegroup.stringValue().getBytes().length > 32760) {
                                wholegroup.setStringValue("Table too large...");
                                System.err.println("table too large:" + wholegroup.stringValue().getBytes().length);
                            }
                            String headers = "";
                            if (col.headers != null) {
                                for (Header hdr : col.headers) {
                                    headers += hdr.text.toLowerCase() + " ";
                                }
                            }
                            coldoc.add(new TextField("headerkeywords", headers.trim(), Field.Store.NO));
                            coldoc.add(wholegroup);
                            coldoc.add(new StringField("filename", file.getAbsolutePath(), Field.Store.YES));
                            coldoc.add(new StringField("type", "column", Field.Store.YES));
                            IntField bstart = new IntField("bytestart",
                                    col.content.get(0).byteStart, Field.Store.YES);
                            IntField bend = new IntField("byteend",
                                    col.content.get(col.content.size() - 1).byteEnd, Field.Store.YES);
                            String content = "";
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content)
                                content += f.text + "|";
                            coldoc.add(new StringField("colcontent",
                                    content.substring(0, content.length() - 1), Field.Store.YES));
                            coldoc.add(bstart);
                            coldoc.add(bend);
                            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                writer.addDocument(coldoc);
                                totalDocAdded++;
                            } else {
                                writer.updateDocument(new Term("path", file.getPath()), coldoc);
                            }
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) {
                                // index one document per cell
                                Document celldoc = new Document();
                                ArrayList<Double> fv = nfg.field2Features(f);
                                if (fv.get(0) == 1 || fv.get(0) == 2) {
                                    try {
                                        celldoc.add(new DoubleField("value", fv.get(1), Field.Store.YES));
                                        celldoc.add(new StringField("text", f.text, Field.Store.YES));
                                        // only index optional features when present and numeric
                                        if (fv.get(2) != null && !Double.isNaN(fv.get(2))) {
                                            celldoc.add(new DoubleField("error", fv.get(2), Field.Store.NO));
                                        }
                                        if (fv.get(5) != null && !Double.isNaN(fv.get(5))) {
                                            celldoc.add(new DoubleField("cellmag", fv.get(5), Field.Store.NO));
                                        }
                                        if (fv.get(4) != null) {
                                            celldoc.add(new DoubleField("cellpvalue", fv.get(4), Field.Store.NO));
                                        }
                                        celldoc.add(new StringField("filename", file.getAbsolutePath(), Field.Store.YES));
                                        celldoc.add(new StringField("type", "cell", Field.Store.YES));
                                        celldoc.add(new IntField("bytestart", f.byteStart, Field.Store.YES));
                                        celldoc.add(new IntField("byteend", f.byteEnd, Field.Store.YES));
                                    } catch (NullPointerException e) {
                                        e.printStackTrace();
                                        System.out.println(f.text);
                                    }
                                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                        writer.addDocument(celldoc);
                                        totalDocAdded++;
                                    } else {
                                        writer.updateDocument(new Term("path", file.getPath()), celldoc);
                                    }
                                }
                            }
                        }
                    }
                }
            } finally {
                fis.close();
            }
        }
    }
}
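Given the package name (rangedsearch), the point of indexing these values as DoubleField/IntField is to make them filterable by numeric range at query time. A hedged sketch of the matching query side (Lucene 4.x API to match the field classes above; the field name is taken from the snippet, the bounds are illustrative):

// match column documents whose "mean" lies in [0.0, 100.0], both ends inclusive
Query q = NumericRangeQuery.newDoubleRange("mean", 0.0, 100.0, true, true);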
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexOffsetAnnotation(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    if (files[i].equals("NeuroScience.num.offset"))
                        indexOffsetAnnotation(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();
                BufferedReader br = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8));
                String line = null;
                String filename = null;
                while ((line = br.readLine()) != null) {
                    if (line.trim().length() == 0) {
                        // a blank line ends the current record: attach the file
                        // name and flush the document to the index
                        doc.add(new StringField("filename", filename, Field.Store.YES));
                        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                            writer.addDocument(doc);
                        } else {
                            System.out.println("updating " + file);
                            writer.updateDocument(new Term("path", file.getPath()), doc);
                        }
                        doc = new Document();
                        filename = null;
                        continue;
                    }
                    String[] spl = line.split("\t");
                    doc.add(new DoubleField(spl[3], Double.parseDouble(spl[5]), Field.Store.YES));
                    if (filename == null)
                        filename = spl[0];
                }
                br.close();
            } finally {
                fis.close();
            }
        }
    }
}
From source file:edu.cmu.lti.oaqa.knn4qa.apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) {
    Options options = new Options();

    options.addOption(CommonParams.ROOT_DIR_PARAM, null, true, CommonParams.ROOT_DIR_DESC);
    options.addOption(CommonParams.SUB_DIR_TYPE_PARAM, null, true, CommonParams.SUB_DIR_TYPE_DESC);
    options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC);
    options.addOption(CommonParams.SOLR_FILE_NAME_PARAM, null, true, CommonParams.SOLR_FILE_NAME_DESC);
    options.addOption(CommonParams.OUT_INDEX_PARAM, null, true, CommonParams.OUT_MINDEX_DESC);

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    try {
        CommandLine cmd = parser.parse(options, args);

        String rootDir = null;
        rootDir = cmd.getOptionValue(CommonParams.ROOT_DIR_PARAM);
        if (null == rootDir)
            Usage("Specify: " + CommonParams.ROOT_DIR_DESC, options);

        String outputDirName = cmd.getOptionValue(CommonParams.OUT_INDEX_PARAM);
        if (null == outputDirName)
            Usage("Specify: " + CommonParams.OUT_MINDEX_DESC, options);

        String subDirTypeList = cmd.getOptionValue(CommonParams.SUB_DIR_TYPE_PARAM);
        if (null == subDirTypeList || subDirTypeList.isEmpty())
            Usage("Specify: " + CommonParams.SUB_DIR_TYPE_DESC, options);

        String solrFileName = cmd.getOptionValue(CommonParams.SOLR_FILE_NAME_PARAM);
        if (null == solrFileName)
            Usage("Specify: " + CommonParams.SOLR_FILE_NAME_DESC, options);

        int maxNumRec = Integer.MAX_VALUE;
        String tmp = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM);
        if (tmp != null) {
            try {
                maxNumRec = Integer.parseInt(tmp);
                if (maxNumRec <= 0) {
                    Usage("The maximum number of records should be a positive integer", options);
                }
            } catch (NumberFormatException e) {
                Usage("The maximum number of records should be a positive integer", options);
            }
        }

        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        String subDirs[] = subDirTypeList.split(",");

        int docNum = 0;

        // No English analyzer here, all language-related processing is done already,
        // here we simply white-space tokenize and index tokens verbatim.
        Analyzer analyzer = new WhitespaceAnalyzer();
        FSDirectory indexDir = FSDirectory.open(outputDir);
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer.getVersion(), analyzer);

        System.out.println("Creating a new Lucene index, maximum # of docs to process: " + maxNumRec);
        indexConf.setOpenMode(OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(indexDir, indexConf);

        for (int subDirId = 0; subDirId < subDirs.length && docNum < maxNumRec; ++subDirId) {
            String inputFileName = rootDir + "/" + subDirs[subDirId] + "/" + solrFileName;

            System.out.println("Input file name: " + inputFileName);

            BufferedReader inpText = new BufferedReader(
                    new InputStreamReader(CompressUtils.createInputStream(inputFileName)));
            String docText = XmlHelper.readNextXMLIndexEntry(inpText);

            for (; docText != null && docNum < maxNumRec; docText = XmlHelper.readNextXMLIndexEntry(inpText)) {
                ++docNum;
                Map<String, String> docFields = null;
                Document luceneDoc = new Document();

                try {
                    docFields = XmlHelper.parseXMLIndexEntry(docText);
                } catch (Exception e) {
                    System.err.println(String.format("Parsing error, offending DOC #%d:\n%s", docNum, docText));
                    System.exit(1);
                }

                String id = docFields.get(UtilConst.TAG_DOCNO);

                if (id == null) {
                    System.err.println(String.format("No ID tag '%s', offending DOC #%d:\n%s",
                            UtilConst.TAG_DOCNO, docNum, docText));
                }

                luceneDoc.add(new StringField(UtilConst.TAG_DOCNO, id, Field.Store.YES));

                for (Map.Entry<String, String> e : docFields.entrySet())
                    if (!e.getKey().equals(UtilConst.TAG_DOCNO)) {
                        luceneDoc.add(new TextField(e.getKey(), e.getValue(), Field.Store.YES));
                    }

                indexWriter.addDocument(luceneDoc);
                if (docNum % 1000 == 0)
                    System.out.println("Indexed " + docNum + " docs");
            }
            System.out.println("Indexed " + docNum + " docs");
        }

        indexWriter.commit();
        indexWriter.close();
    } catch (ParseException e) {
        Usage("Cannot parse arguments", options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }
}
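A closing note on the last two writer calls: commit() makes the added documents durable and visible to newly opened readers, and close() releases the writer, by default committing any pending changes as well, so the explicit commit() immediately before close() is defensive rather than required. A minimal try-with-resources form (a sketch against the same API, assuming commit-on-close is acceptable) makes the cleanup automatic:

try (IndexWriter indexWriter = new IndexWriter(indexDir, indexConf)) {
    // ... indexWriter.addDocument(luceneDoc) calls ...
}   // close() commits pending changes and releases the index write lock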