Example usage for org.apache.lucene.index IndexWriter addDocument

List of usage examples for org.apache.lucene.index IndexWriter addDocument

Introduction

On this page you can find example usages of org.apache.lucene.index IndexWriter addDocument.

Prototype

public long addDocument(Iterable<? extends IndexableField> doc) throws IOException 

Source Link

Document

Adds a document to this index.

Usage

From source file:driver651.Driver651.java

License:Apache License

public static void main(String[] args) throws Exception {
    //@FieldCacheImpl.java
    int threadNo = Integer.parseInt(args[0]);
    // 138 vs 179                 507 (original)  all-threads-one-cache
    // 1295      1779               one-thread-one-cache
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
    int theInt = Integer.MAX_VALUE;
    for (int j = 0; j < NUM_FIELDS; j++) {
        for (int i = 0; i < NUM_DOCS; i++) {
            Document doc = new Document();
            doc.add(new Field("theField" + j, String.valueOf(theInt--), Field.Store.NO,
                    Field.Index.UN_TOKENIZED));// notice the field "theFieldj"
            writer.addDocument(doc);
        }/*w  w w. j  a v a2s .  c  o m*/
    }

    writer.close();
    reader = IndexReader.open(directory);

    FieldCacheImpl cache = new FieldCacheImpl();// move it out of the loop, then you get the all-threads-one-cache scenario!

    WorkerThread[] workers = new WorkerThread[threadNo];
    for (int i = 0; i < threadNo; i++) {

        workers[i] = new WorkerThread(cache);
    }
    long start = System.currentTimeMillis();
    for (int i = 0; i < threadNo; i++) {
        workers[i].start();
    }

    for (int i = 0; i < threadNo; i++) {
        workers[i].join();
    }

    long end = System.currentTimeMillis();
    System.out.println("duration: " + (end - start));
}

From source file:dynamicrefactoring.interfaz.wizard.search.internal.SearchableTypeIndexer.java

License:Open Source License

/**
 * Genera un indice en el directorio pasado.
 * /*from  w  w  w .  jav  a 2  s. c  om*/
 * @param elementType
 *            tipo de searchable cuyos elementos se van a indizar
 * @param directory
 *            directorio sobre el que se generara el indice
 * 
 * @return numero de elementos indizados
 * @throws CorruptIndexException
 * 
 * @throws IOException
 */
@Override
public int index(SearchableType elementType, Directory directory) throws IOException {
    final IndexWriter writer = createWriter(directory);
    int numIndexed = 0;
    JavadocReader javadocReader = EclipseBasedJavadocReader.INSTANCE;
    for (String fullyQualifiedName : elementType.getClassesToIndex()) {
        String text = javadocReader.getTypeJavaDocAsPlainText(fullyQualifiedName);
        Document doc = getDocument(fullyQualifiedName, text);
        writer.addDocument(doc);
        numIndexed++;
    }
    close(writer);
    return numIndexed;
}

From source file:edu.albany.ir.example.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.//  w  ww .j a va 2  s.co m
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                fnfe.printStackTrace();
                return;
            }

            try {

                // our code
                // *********************************************************
                String record = null;
                int a, b, stringNum = 0, i = 0;
                // String[] docContents = new String[1000];
                // String[] docNos = new String[10000];
                String docName = null;
                // make a new, empty document
                Document doc = new Document();

                BufferedReader reader = new BufferedReader(new InputStreamReader(fis));

                record = new String();
                while ((record = reader.readLine()) != null) {
                    a = record.lastIndexOf("<DOCNO>");
                    b = record.indexOf("</DOCNO>");

                    if (a >= 0 && b > 0) // if this line contains the DOCNO
                    {
                        stringNum++;
                        // docNos[stringNum] = record.substring(a+7,b-1);
                        docName = record.substring(a + 7, b).trim();
                        // add a document
                        if (stringNum >= 1) {
                            // index previous document
                            if (stringNum >= 2)
                                writer.addDocument(doc);

                            // start new document
                            doc = new Document();
                            // doc.add(new Field("path", file.getPath()+
                            // "/"+docName,
                            // Add the path of the file as a field named
                            // "path". Use a
                            // field that is indexed (i.e. searchable), but
                            // don't tokenize
                            // the field into separate words and don't index
                            // term frequency
                            // or positional information:
                            Field pathField = new Field("path", docName, Field.Store.YES,
                                    Field.Index.NOT_ANALYZED_NO_NORMS);
                            pathField.setOmitTermFreqAndPositions(true);
                            doc.add(pathField);
                            // doc.add(new Field("path", docName,
                            // Field.Store.YES,
                            // Field.Index.UN_TOKENIZED));
                            // System.out.println("adding " +
                            // file.getPath()+ "/"+docName);
                            System.out.println("adding " + docName);

                            // Add the last modified date of the file a
                            // field named "modified".
                            // Use a NumericField that is indexed (i.e.
                            // efficiently filterable with
                            // NumericRangeFilter). This indexes to
                            // milli-second resolution, which
                            // is often too fine. You could instead create a
                            // number based on
                            // year/month/day/hour/minutes/seconds, down the
                            // resolution you require.
                            // For example the long value 2011021714 would
                            // mean
                            // February 17, 2011, 2-3 PM.
                            NumericField modifiedField = new NumericField("modified");
                            modifiedField.setLongValue(file.lastModified());
                            doc.add(modifiedField);

                            // doc.add(new Field("modified",
                            // DateField.timeToString(file.lastModified()),
                            // Field.Store.YES,
                            // Field.Index.UN_TOKENIZED));
                        }
                    } else {

                        doc.add(new Field("contents", record, Field.Store.YES, Field.Index.ANALYZED, // tokenized
                                Field.TermVector.YES));
                        // docContents[stringNum] = docContents[stringNum] +
                        // record;
                        // add contents to document
                        // Add the contents of the file to a field named
                        // "contents". Specify a Reader,
                        // so that the text of the file is tokenized and
                        // indexed, but not stored.
                        // Note that FileReader expects the file to be in
                        // UTF-8 encoding.
                        // If that's not the case searching for special
                        // characters will fail.
                        // doc.add(new Field("contents", new
                        // BufferedReader(new InputStreamReader(fis,
                        // "UTF-8"))));
                    }
                    a = 0;
                    b = 0;
                }

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old
                    // document can be there):
                    System.out.println("adding " + docName);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have
                    // been indexed) so
                    // we use updateDocument instead to replace the old one
                    // matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:edu.cmu.cs.in.search.HoopLuceneIndex.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * //w  w w .  j a  v a  2 s  .  c om
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();

            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;

            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}

From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazInfoIndexerAllCountries.java

License:Apache License

void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException {

    Document d = new Document();
    StringField nfid = new StringField("ID", "0", Field.Store.YES);
    StringField name = new StringField("ORIGINAL-NAME", "", Field.Store.YES);
    IntField nfaltnames = new IntField("ALTNAME-COUNT", 0, Field.Store.YES);
    DoubleField nflong = new DoubleField("LONGTITUDE", 0.0, Field.Store.YES);
    DoubleField nfla = new DoubleField("LATITUDE", 0.0, Field.Store.YES);
    LongField nfpop = new LongField("POPULATION", 0, Field.Store.YES);
    StringField sfcountry = new StringField("COUNTRY-CODE", "", Field.Store.YES);
    StringField sfadm1 = new StringField("ADM1-CODE", "", Field.Store.YES);
    StringField sfadm2 = new StringField("ADM2-CODE", "", Field.Store.YES);
    StringField sfadm3 = new StringField("ADM3-CODE", "", Field.Store.YES);
    StringField sfadm4 = new StringField("ADM4-CODE", "", Field.Store.YES);
    StringField sffeatureclass = new StringField("FEATURE-CLASS", "", Field.Store.YES);
    StringField sffeature = new StringField("FEATURE", "", Field.Store.YES);
    StringField sftimezone = new StringField("TIMEZONE", "", Field.Store.YES);
    d.add(nfid);/*from   w  ww .j a  v  a  2 s  .co m*/
    d.add(name);
    d.add(nfaltnames);
    d.add(nflong);
    d.add(nfla);
    d.add(nfpop);
    d.add(sfcountry);
    d.add(sfadm1);
    d.add(sfadm2);
    d.add(sfadm3);
    d.add(sfadm4);
    d.add(sffeatureclass);
    d.add(sffeature);
    d.add(sftimezone);

    String line;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        String[] column = line.trim().split("\t");

        // get other columns except for the location words
        String id = column[0];
        String utfname = column[1];
        String altnames = column[3];
        String latitude = column[4];
        String longtitude = column[5];
        double dlong, dla;
        if (latitude == null) {
            dlong = 999;
            dla = 999;
        } else {
            dlong = Double.parseDouble(longtitude);
            dla = Double.parseDouble(latitude);
        }
        String featureclass = column[6];
        String feature = column[7];
        String country = column[8];
        String population = column[14];
        long longpop;
        if (population == null)
            longpop = -1;
        longpop = Long.parseLong(population);
        String timezone = column[17];

        // To Do: set values to document d, and index it
        nfid.setStringValue(id);// 1
        name.setStringValue(utfname);
        nfaltnames.setIntValue(altnames.split(",").length);
        nflong.setDoubleValue(dlong);
        nfla.setDoubleValue(dla);
        nfpop.setLongValue(longpop);

        sfcountry.setStringValue(country.toLowerCase());
        sfadm1.setStringValue(column[10].toLowerCase());
        sfadm2.setStringValue(column[11].toLowerCase());
        sfadm3.setStringValue(column[12].toLowerCase());
        sfadm4.setStringValue(column[13].toLowerCase());
        sffeatureclass.setStringValue(featureclass);
        sffeature.setStringValue(feature);
        sftimezone.setStringValue(timezone);// 13

        // add this new document.
        iw.addDocument(d);
    }
}

From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAllCountries.java

License:Apache License

void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {

    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);// w  w w  . ja v  a 2s .  co m
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);

    String line;
    String[] column;
    String id, phrase, otherlang, oneOtherlang;
    int otherlangLength;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        column = line.trim().split("\t");

        // get other columns except for the location words
        id = column[0];
        phrase = column[1];
        otherlang = column[3];

        // To Do: set values to document d, and index it
        nfid.setStringValue(id);// 1
        // id does not change, change the strings to index.
        phrase = phrase.toLowerCase();
        sforigin.setStringValue(phrase);// 5
        normnws.setStringValue(phrase.replaceAll(" ", ""));

        getIndexFeatures(phrase);

        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());

        lang.setStringValue("");
        // add this new document.
        iw.addDocument(d);

        if (otherlang.length() == 0)
            continue;
        String otherlangs[] = otherlang.split(",");
        otherlangLength = otherlangs.length;
        for (int i = 0; i < otherlangLength; i++) {
            if (otherlangs[i].length() == 0)
                continue;
            // id does not change, change the strings to index.
            oneOtherlang = otherlangs[i].toLowerCase();
            sforigin.setStringValue(oneOtherlang);// 5
            normnws.setStringValue(oneOtherlang.replaceAll(" ", ""));

            getIndexFeatures(oneOtherlang);

            sfunigram.setStringValue(getUnigram());
            sfbigram.setStringValue(getBigram());
            sftrigram.setStringValue(getTrigram());

            lang.setStringValue("");
            // add this new document.
            iw.addDocument(d);
        }
    }
}

From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAltNames.java

License:Apache License

void indexAlterNames(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {

    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);//w w w  . jav  a  2s . c  o  m
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);

    String line;
    String[] column;
    String id, langOrLink, phrase;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        column = line.trim().split("\t");

        // get other columns except for the location words
        id = column[1];
        langOrLink = column[2];
        phrase = column[3];

        // To Do: set values to document d, and index it
        nfid.setStringValue(id);// 1

        phrase = phrase.toLowerCase();

        sforigin.setStringValue(phrase);// 5
        normnws.setStringValue(phrase.replaceAll(" ", ""));

        getIndexFeatures(phrase);

        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());

        lang.setStringValue(langOrLink);
        // add this new document.
        iw.addDocument(d);
    }
}

From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java

License:Apache License

public void indexExplodedXml(IndexWriter writer, File file) throws IOException {
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexExplodedXml(writer, new File(file, files[i]));
                }/*from   ww  w  .  j a  va2s .  co  m*/
            }
        } else {
            FileInputStream fis = new FileInputStream(file);
            try {
                NumericFeatureGenerator nfg = new NumericFeatureGenerator();
                if (this.xmlreader == null) {
                    this.xmlreader = new XmlStAXReader();
                }
                Article a = xmlreader.readArticleFromXml(file.getAbsolutePath());
                for (Table t : a.tables) {
                    for (Group g : t.groups) {
                        for (Column col : g.columns) {
                            // index columns
                            Document coldoc = new Document();
                            ArrayList<Double> cfv = nfg.getFeatureVector(col.content);
                            if (cfv.get(0) != null) {
                                DoubleField intratio = new DoubleField("intratio", cfv.get(0), Field.Store.NO);
                                coldoc.add(intratio);
                            }
                            if (cfv.get(1) != null) {
                                DoubleField floatratio = new DoubleField("floatratio", cfv.get(1),
                                        Field.Store.NO);
                                coldoc.add(floatratio);
                            }
                            if (cfv.get(3) != null) {
                                DoubleField mean = new DoubleField("mean", cfv.get(3), Field.Store.NO);
                                coldoc.add(mean);
                            }
                            if (cfv.get(4) != null) {
                                DoubleField std = new DoubleField("std", cfv.get(4), Field.Store.NO);
                                coldoc.add(std);
                            }
                            if (cfv.get(6) != null) {
                                DoubleField min = new DoubleField("min", cfv.get(6), Field.Store.NO);
                                coldoc.add(min);
                            }
                            if (cfv.get(7) != null) {
                                DoubleField max = new DoubleField("max", cfv.get(7), Field.Store.NO);
                                coldoc.add(max);
                            }
                            if (cfv.get(8) != null) {
                                DoubleField acc = new DoubleField("acc", cfv.get(8), Field.Store.NO);
                                coldoc.add(acc);
                            }
                            if (cfv.get(11) != null) {
                                DoubleField colmag = new DoubleField("colmag", cfv.get(11), Field.Store.NO);
                                coldoc.add(colmag);
                            }

                            StringField wholegroup = new StringField("wholegroup", g.toString(),
                                    Field.Store.YES);
                            if (wholegroup.stringValue().getBytes().length > 32760) {
                                wholegroup.setStringValue("Table too large...");
                                System.err.println(
                                        "table too large:" + wholegroup.stringValue().getBytes().length);

                            }
                            String headers = "";
                            if (col.headers != null) {
                                for (Header hdr : col.headers) {
                                    headers += hdr.text.toLowerCase() + " ";
                                }
                            }
                            TextField header = new TextField("headerkeywords", headers.trim(), Field.Store.NO);
                            coldoc.add(header);
                            coldoc.add(wholegroup);
                            StringField fname = new StringField("filename", file.getAbsolutePath(),
                                    Field.Store.YES);
                            coldoc.add(fname);
                            StringField type = new StringField("type", "column", Field.Store.YES);
                            coldoc.add(type);
                            IntField bstart = new IntField("bytestart", col.content.get(0).byteStart,
                                    Field.Store.YES);
                            IntField bend = new IntField("byteend",
                                    col.content.get(col.content.size() - 1).byteEnd, Field.Store.YES);
                            String content = "";
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content)
                                content += f.text + "|";
                            StringField colcontent = new StringField("colcontent",
                                    content.substring(0, content.length() - 1), Field.Store.YES);
                            coldoc.add(colcontent);
                            coldoc.add(bstart);
                            coldoc.add(bend);
                            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                writer.addDocument(coldoc);
                                totalDocAdded++;
                            } else {
                                writer.updateDocument(new Term("path", file.getPath()), coldoc);
                            }
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) {
                                // Index single cell
                                Document celldoc = new Document();
                                ArrayList<Double> fv = nfg.field2Features(f);
                                if (fv.get(0) == 1 || fv.get(0) == 2) {
                                    try {
                                        DoubleField df = new DoubleField("value", fv.get(1), Field.Store.YES);
                                        celldoc.add(df);
                                        StringField textf = new StringField("text", f.text, Field.Store.YES);
                                        celldoc.add(textf);
                                        if (fv.get(2) != null & fv.get(2) != Double.NaN) {
                                            DoubleField errf = new DoubleField("error", fv.get(2),
                                                    Field.Store.NO);
                                            celldoc.add(errf);
                                        }
                                        if (fv.get(5) != Double.NaN) {
                                            DoubleField magf = new DoubleField("cellmag", fv.get(5),
                                                    Field.Store.NO);
                                            celldoc.add(magf);
                                        }
                                        if (fv.get(4) != null) {
                                            DoubleField pvalue = new DoubleField("cellpvalue", fv.get(4),
                                                    Field.Store.NO);
                                            celldoc.add(pvalue);
                                        }
                                        StringField sf = new StringField("filename", file.getAbsolutePath(),
                                                Field.Store.YES);
                                        celldoc.add(sf);

                                        StringField ctype = new StringField("type", "cell", Field.Store.YES);
                                        celldoc.add(ctype);
                                        //StringField cwholegroup=new StringField("wholegroup", g.toString(), Field.Store.YES);
                                        //celldoc.add(cwholegroup);
                                        IntField cbstart = new IntField("bytestart", f.byteStart,
                                                Field.Store.YES);
                                        IntField cbend = new IntField("byteend", f.byteEnd, Field.Store.YES);
                                        celldoc.add(cbstart);
                                        celldoc.add(cbend);
                                    } catch (NullPointerException e) {
                                        e.printStackTrace();
                                        System.out.println(f.text);
                                    }
                                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                        writer.addDocument(celldoc);
                                        totalDocAdded++;
                                    } else {
                                        writer.updateDocument(new Term("path", file.getPath()), celldoc);
                                    }
                                }
                            }
                        }
                    }
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java

License:Apache License

/**
 * Indexes the numeric offset annotations found under {@code file}.
 *
 * <p>If {@code file} is a directory, only its direct children named
 * {@code NeuroScience.num.offset} are processed (recursively through this method).
 * Otherwise the file is read as UTF-8 text made of tab-separated lines, where
 * column 0 is used as the record's file name, column 3 as the Lucene field name and
 * column 5 as the numeric value of that field. A blank line terminates the current
 * record, which is then written to the index as one document.
 *
 * <p>Unreadable or vanished files are skipped silently. Blank lines that do not
 * terminate any data are ignored, and a trailing record that is not followed by a
 * blank line is still indexed.
 *
 * @param writer the index writer that receives one document per record
 * @param file   a directory to scan or an offset-annotation file to index
 * @throws IOException if reading the file or writing to the index fails
 */
public void indexOffsetAnnotation(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (!file.canRead()) {
        return;
    }

    if (file.isDirectory()) {
        String[] files = file.list();
        // list() yields null when an IO error occurs
        if (files != null) {
            for (String name : files) {
                if (name.equals("NeuroScience.num.offset")) {
                    indexOffsetAnnotation(writer, new File(file, name));
                }
            }
        }
        return;
    }

    FileInputStream fis;
    try {
        fis = new FileInputStream(file);
    } catch (FileNotFoundException fnfe) {
        // the file could not be opened despite the canRead() check: nothing to index
        return;
    }

    // Both the stream and the reader are closed on every exit path, including exceptions.
    try (FileInputStream in = fis;
            BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
        Document doc = new Document();
        // Set from the first data line of the current record; null means "no data yet".
        String filename = null;
        String line;
        while ((line = br.readLine()) != null) {
            if (line.trim().length() == 0) {
                // A separator with no preceding data would produce a null "filename"
                // field value, which Lucene rejects; skip such empty records.
                if (filename != null) {
                    addOffsetRecord(writer, file, doc, filename);
                    doc = new Document();
                    filename = null;
                }
                continue;
            }
            String[] spl = line.split("\t");
            doc.add(new DoubleField(spl[3], Double.parseDouble(spl[5]), Field.Store.YES));
            if (filename == null) {
                filename = spl[0];
            }
        }
        // Flush the last record when the file does not end with a blank line.
        if (filename != null) {
            addOffsetRecord(writer, file, doc, filename);
        }
    }
}

/**
 * Completes one record with its stored "filename" field and writes it to the index:
 * added when the writer was opened in CREATE mode, otherwise sent through
 * {@code updateDocument} keyed on the source file's path.
 */
private void addOffsetRecord(IndexWriter writer, File file, Document doc, String filename)
        throws IOException {
    doc.add(new StringField("filename", filename, Field.Store.YES));
    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
        writer.addDocument(doc);
    } else {
        System.out.println("updating " + file);
        // NOTE(review): no "path" field is added to these documents in this method,
        // so this term may never match an existing document -- confirm intent.
        writer.updateDocument(new Term("path", file.getPath()), doc);
    }
}

From source file:edu.cmu.lti.oaqa.knn4qa.apps.LuceneIndexer.java

License:Apache License

/**
 * Command-line entry point that builds a new Lucene index from XML index entries.
 *
 * <p>For every sub-directory listed (comma-separated) in the sub-directory option, the
 * file {@code <rootDir>/<subDir>/<solrFileName>} is read entry by entry. Each entry
 * becomes one document: the {@code UtilConst.TAG_DOCNO} value is indexed as an untokenized
 * stored ID, every other field is indexed as stored, whitespace-tokenized text.
 * Processing stops after the optional maximum number of records.
 *
 * <p>Any parsing error, missing document ID or other exception terminates the process
 * with exit status 1.
 *
 * @param args command-line arguments, parsed with Apache Commons CLI
 */
public static void main(String[] args) {
    Options options = new Options();

    options.addOption(CommonParams.ROOT_DIR_PARAM, null, true, CommonParams.ROOT_DIR_DESC);
    options.addOption(CommonParams.SUB_DIR_TYPE_PARAM, null, true, CommonParams.SUB_DIR_TYPE_DESC);
    options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC);
    options.addOption(CommonParams.SOLR_FILE_NAME_PARAM, null, true, CommonParams.SOLR_FILE_NAME_DESC);
    options.addOption(CommonParams.OUT_INDEX_PARAM, null, true, CommonParams.OUT_MINDEX_DESC);

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    try {
        CommandLine cmd = parser.parse(options, args);

        String rootDir = cmd.getOptionValue(CommonParams.ROOT_DIR_PARAM);
        if (null == rootDir) {
            Usage("Specify: " + CommonParams.ROOT_DIR_DESC, options);
        }

        String outputDirName = cmd.getOptionValue(CommonParams.OUT_INDEX_PARAM);
        if (null == outputDirName) {
            Usage("Specify: " + CommonParams.OUT_MINDEX_DESC, options);
        }

        String subDirTypeList = cmd.getOptionValue(CommonParams.SUB_DIR_TYPE_PARAM);
        if (null == subDirTypeList || subDirTypeList.isEmpty()) {
            Usage("Specify: " + CommonParams.SUB_DIR_TYPE_DESC, options);
        }

        String solrFileName = cmd.getOptionValue(CommonParams.SOLR_FILE_NAME_PARAM);
        if (null == solrFileName) {
            Usage("Specify: " + CommonParams.SOLR_FILE_NAME_DESC, options);
        }

        // Unlimited unless the optional record cap is given.
        int maxNumRec = Integer.MAX_VALUE;

        String tmp = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM);
        if (tmp != null) {
            try {
                maxNumRec = Integer.parseInt(tmp);
                if (maxNumRec <= 0) {
                    Usage("The maximum number of records should be a positive integer", options);
                }
            } catch (NumberFormatException e) {
                Usage("The maximum number of records should be a positive integer", options);
            }
        }

        // Make sure the index location exists and is a writable directory.
        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        String[] subDirs = subDirTypeList.split(",");

        int docNum = 0;

        // No English analyzer here, all language-related processing is done already,
        // here we simply white-space tokenize and index tokens verbatim.
        Analyzer analyzer = new WhitespaceAnalyzer();
        FSDirectory indexDir = FSDirectory.open(outputDir);
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer.getVersion(), analyzer);

        System.out.println("Creating a new Lucene index, maximum # of docs to process: " + maxNumRec);
        indexConf.setOpenMode(OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(indexDir, indexConf);

        for (int subDirId = 0; subDirId < subDirs.length && docNum < maxNumRec; ++subDirId) {
            String inputFileName = rootDir + "/" + subDirs[subDirId] + "/" + solrFileName;

            System.out.println("Input file name: " + inputFileName);

            // The reader is closed once this input file is done, so file handles do not
            // accumulate across sub-directories.
            try (BufferedReader inpText = new BufferedReader(
                    new InputStreamReader(CompressUtils.createInputStream(inputFileName)))) {
                String docText = XmlHelper.readNextXMLIndexEntry(inpText);

                for (; docText != null && docNum < maxNumRec;
                        docText = XmlHelper.readNextXMLIndexEntry(inpText)) {
                    ++docNum;
                    Map<String, String> docFields = null;

                    Document luceneDoc = new Document();

                    try {
                        docFields = XmlHelper.parseXMLIndexEntry(docText);
                    } catch (Exception e) {
                        System.err.println(
                                String.format("Parsing error, offending DOC #%d:\n%s", docNum, docText));
                        System.exit(1);
                    }

                    String id = docFields.get(UtilConst.TAG_DOCNO);

                    if (id == null) {
                        System.err.println(String.format("No ID tag '%s', offending DOC #%d:\n%s",
                                UtilConst.TAG_DOCNO, docNum, docText));
                        // A document without an ID cannot be indexed (a null field value is
                        // rejected by Lucene), so stop here like for a parsing error.
                        System.exit(1);
                    }

                    luceneDoc.add(new StringField(UtilConst.TAG_DOCNO, id, Field.Store.YES));

                    for (Map.Entry<String, String> e : docFields.entrySet()) {
                        if (!e.getKey().equals(UtilConst.TAG_DOCNO)) {
                            luceneDoc.add(new TextField(e.getKey(), e.getValue(), Field.Store.YES));
                        }
                    }
                    indexWriter.addDocument(luceneDoc);
                    if (docNum % 1000 == 0) {
                        System.out.println("Indexed " + docNum + " docs");
                    }
                }
            }
            System.out.println("Indexed " + docNum + " docs");
        }

        indexWriter.commit();
        indexWriter.close();

    } catch (ParseException e) {
        Usage("Cannot parse arguments", options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }

}