Example usage for org.apache.lucene.index IndexWriter updateDocument

Introduction

This page collects example usages of org.apache.lucene.index.IndexWriter updateDocument.

Prototype

private long updateDocument(final DocumentsWriterDeleteQueue.Node<?> delNode,
            Iterable<? extends IndexableField> doc) throws IOException

This is IndexWriter's private internal overload. Application code, including every example below, goes through the public overload updateDocument(Term term, Iterable<? extends IndexableField> doc), which atomically deletes every document containing term and then adds doc.
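
A minimal sketch of the public overload (Lucene 5+ constructors; the index path and field names are illustrative):

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class UpdateDocumentSketch {
    public static void main(String[] args) throws IOException {
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new StringField("id", "42", Field.Store.YES)); // unique, untokenized key
            doc.add(new TextField("contents", "updated text", Field.Store.NO));
            // Deletes any document whose "id" field contains the term "42",
            // then adds doc, as a single atomic operation:
            writer.updateDocument(new Term("id", "42"), doc);
        }
    }
}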

Usage

From source file:index.IndexOmimtsv.java

License:Apache License

/** Indexes the rows of an OMIM TSV file, one Lucene document per row */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Parse the TSV content line by line, creating one document per row.
        // Read with an explicit UTF-8 charset so special characters survive:
        InputStreamReader ipsr = new InputStreamReader(stream, StandardCharsets.UTF_8);
        BufferedReader br = new BufferedReader(ipsr);
        String line = br.readLine(); // skip the first line
        int cpt = 0;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.trim().split("\t");
            if (tokens.length > 6) {
                String id = tokens[0].split("/")[tokens[0].split("/").length - 1].trim();
                if (id.matches("^[0-9]*")) {
                    doc = new Document();
                    cpt++;
                    doc.add(new TextField("ID", id, Field.Store.NO));
                    if (!tokens[5].trim().matches("^C[0-9].*")) {
                        for (String token : tokens) {
                            if (token.trim().matches("^C[0-9].*")) {
                                doc.add(new StoredField("CUI", token.trim()));
                                break;
                            }
                        }
                        if (doc.getFields().size() != 2)
                            doc.add(new StoredField("CUI", ""));
                    } else
                        doc.add(new StoredField("CUI", tokens[5].trim()));
                    doc.add(new StoredField("Label", tokens[1].trim()));
                    writer.addDocument(doc);
                }
            }
        }
        System.out.println("Number of documents indexed: " + cpt);
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
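
The example above keys each TSV row on an "ID" value but still writes rows with addDocument, so re-running the indexer duplicates them. A hedged sketch of the row-keyed alternative, reusing the example's own doc, id, and writer (switching "ID" to an untokenized StringField so the update term matches exactly):

                    // Inside the row loop, in place of writer.addDocument(doc):
                    doc.add(new StringField("ID", id, Field.Store.NO)); // untokenized update key
                    writer.updateDocument(new Term("ID", id), doc);     // replaces any earlier row with this ID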

From source file:indexer.LuceneIndexerAddDocument.java

/**
 * Indexes a single document with the aid of Apache Tika.
 *
 * @param writer Writer to the index where the given file/dir info will be
 * stored.
 * @param file The file to index, or the directory to recurse into to find
 * files to index.
 * @param attrs This is the attributes from the given file gathered from
 * walking the file tree.
 * @param global This is for reference to the global class variables and
 * methods.
 * @throws IOException
 */
static void indexDoc(IndexWriter writer, Path file, BasicFileAttributes attrs, Global global)
        throws IOException {
    File document = file.toFile();
    // renameTo(itself) is a heuristic lock check: it fails while another
    // process holds the file open (mainly a Windows behavior).
    if (document.renameTo(document)) {
        try (InputStream stream = Files.newInputStream(file)) {

            //make a new, empty document
            Document doc = new Document();

            //Add the path of the file as a field named "path".
            Field pathField = new StringField("path", file.toString(), Field.Store.YES);
            doc.add(pathField);

            //Add the last modified date of the file as a field named "modified".
            doc.add(new LongField("modified", attrs.lastModifiedTime().toMillis(), Field.Store.YES));

            //Add the created date of the file as a field named "created".
            doc.add(new LongField("created", attrs.creationTime().toMillis(), Field.Store.YES));

            //Add the document File Name
            doc.add(new StringField("filename", file.getFileName().toString(), Field.Store.YES));

            //Add the contents of the file as a field named "vcontents". 
            //Parser type for Tika
            BodyContentHandler handler = new BodyContentHandler(global.WRITE_LIMIT);
            Metadata metadata = new Metadata();
            FileInputStream inputstream = new FileInputStream(new File(file.toString()));
            ParseContext pcontext = new ParseContext();

            //New Field Type
            FieldType bodyType = new FieldType();
            bodyType.setStored(true);
            bodyType.setTokenized(true);
            // for Highlighter, FastvectorHighlighter
            bodyType.setStoreTermVectors(true);
            bodyType.setStoreTermVectorPositions(true);
            bodyType.setStoreTermVectorOffsets(true);
            // for PostingsHighlighter
            bodyType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

            /**
             * Determine the document type and the proper parser for the
             * document After the document is determined we grab the content
             * and position offset for highlighting.
             */
            try {
                if (file.toString().endsWith(".pdf")) {
                    PDFParser pdfparser = new PDFParser();
                    pdfparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".docx") || file.toString().endsWith(".pptx")
                        || file.toString().endsWith(".xlsx") || file.toString().endsWith(".docm")
                        || file.toString().endsWith(".pptm") || file.toString().endsWith(".xlsm")) {
                    OOXMLParser msofficeparser = new OOXMLParser();
                    msofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".doc") || file.toString().endsWith(".ppt")
                        || file.toString().endsWith(".xls")) {
                    OfficeParser msofficeparser = new OfficeParser();
                    msofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".odt") || file.toString().endsWith(".odp")
                        || file.toString().endsWith(".ods")) {
                    OpenDocumentParser openofficeparser = new OpenDocumentParser();
                    openofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".epub")) {
                    EpubParser epubParser = new EpubParser();
                    epubParser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".xml")) {
                    XMLParser XMLparser = new XMLParser();
                    XMLparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".htm") || file.toString().endsWith(".html")
                        || file.toString().endsWith(".mhtml")) {
                    HtmlParser HTMLparser = new HtmlParser();
                    HTMLparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".rtf")) {
                    RTFParser RTFparser = new RTFParser();
                    RTFparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".txt")) {
                    TXTParser TXTparser = new TXTParser();
                    TXTparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else {
                    BufferedReader buffedRead = new BufferedReader(
                            new InputStreamReader(stream, StandardCharsets.UTF_8));
                    doc.add(new TextField("vcontent", buffedRead));
                }
            } catch (SAXException | TikaException ex) {
                log.fatal("Document Parsing Exception");
            }

            if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                // New index, so we just add the document (no old document can be there):
                writer.addDocument(doc);
                System.out.println("adding " + file);
            } else {
                /**
                 * Existing index (an old copy of this document may have
                 * been indexed) so we use updateDocument instead to replace
                 * the old one matching the exact path, if present:
                 */
                writer.updateDocument(new Term("path", file.toString()), doc);
                System.out.println("updating " + file);
            }
        }
    } else {
        System.out.println("LOCKED: " + file);
    }
}
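
The extension-by-extension parser chain above can usually be collapsed with Tika's AutoDetectParser, which detects the MIME type and dispatches to the matching parser. A sketch under that assumption, reusing the example's inputstream, handler, metadata, pcontext, doc, and bodyType:

            try {
                // One parser call replaces the whole if/else-if chain over extensions:
                new AutoDetectParser().parse(inputstream, handler, metadata, pcontext);
                doc.add(new Field("vcontent", handler.toString(), bodyType));
            } catch (SAXException | TikaException ex) {
                log.fatal("Document Parsing Exception");
            }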

From source file:InformationRetrieval.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);
                BufferedReader br = new BufferedReader(new FileReader(file));
                br.readLine(); // skip the first line
                String titleLine = br.readLine();
                Field title = new TextField("Title", titleLine == null ? "" : titleLine, Field.Store.YES);
                title.setBoost(90F);
                doc.add(title);

                // Build a short snippet from the next three lines, if present:
                StringBuilder snipBuilder = new StringBuilder();
                for (int i = 0; i < 3; i++) {
                    String s = br.readLine();
                    if (s == null)
                        break;
                    snipBuilder.append(s).append(' ');
                }

                Field snippet = new StringField("Snippet", snipBuilder.toString().trim(), Field.Store.YES);
                doc.add(snippet);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}
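
Title.setBoost(90F) above is an index-time boost; index-time boosting was deprecated and later removed (Lucene 7). On current versions the usual substitute is a query-time BoostQuery, sketched here with an illustrative query term:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

Query boostedTitle = new BoostQuery(new TermQuery(new Term("Title", "lucene")), 90f);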

From source file:io.anserini.index.IndexTweetsUpdatePlace.java

License:Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(OptionBuilder.withArgName("collection").hasArg()
            .withDescription("source collection directory").create(COLLECTION_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)
            || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweetsUpdatePlace.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    System.out.println(collectionPath + " " + indexPath);

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);

    }

    final StatusStream stream = new JsonStatusCorpusReader(file);

    final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION)));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND);

    final IndexWriter writer = new IndexWriter(dir, config);
    System.out.print("Original # of docs " + writer.numDocs());
    int updateCount = 0;

    Runtime.getRuntime().addShutdownHook(new Thread() {
        public void run() {

            try {
                stream.close();
            } catch (IOException e1) {
                e1.printStackTrace(); // best effort; continue shutting down
            }

            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace(); // best effort; continue shutting down
            }
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace(); // best effort; continue shutting down
            }
            System.out.println("Shutting down");

        }
    });
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {

            if (status.getPlace() != null) {

                //               Query q = NumericRangeQuery.newLongRange(TweetStreamReader.StatusField.ID.name, status.getId(),
                //                     status.getId(), true, true);
                //               System.out.print("Deleting docCount="+writer.numDocs());
                //               writer.deleteDocuments(q);
                //               writer.commit();
                //               System.out.print(" Deleted docCount="+writer.numDocs());

                Document doc = new Document();
                doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
                doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
                doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

                doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

                doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount(), Store.YES));
                doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount(), Store.YES));
                doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));
                doc.add(new DoubleField(StatusField.LONGITUDE.name, status.getLongitude(), Store.YES));
                doc.add(new DoubleField(StatusField.LATITUDE.name, status.getLatitude(), Store.YES));
                doc.add(new StringField(StatusField.PLACE.name, status.getPlace(), Store.YES));
                long inReplyToStatusId = status.getInReplyToStatusId();
                if (inReplyToStatusId > 0) {
                    doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(),
                            Field.Store.YES));
                }

                String lang = status.getLang();
                if (!lang.equals("unknown")) {
                    doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
                }

                long retweetStatusId = status.getRetweetedStatusId();
                if (retweetStatusId > 0) {
                    doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(),
                            Field.Store.YES));
                    doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
                    if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                        LOG.warn("Error parsing retweet fields of " + status.getId());
                    }
                }

                long id = status.getId();
                BytesRefBuilder brb = new BytesRefBuilder();
                NumericUtils.longToPrefixCodedBytes(id, 0, brb);
                Term term = new Term(StatusField.ID.name, brb.get());
                writer.updateDocument(term, doc);

                //               writer.addDocument(doc);

                updateCount += 1;

                if (updateCount % 10000 == 0) {

                    LOG.info(updateCount + " statuses updated");
                    writer.commit();
                    System.out.println("Updated docCount=" + writer.numDocs());
                }

            }

        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}
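
The update term above is built with NumericUtils.longToPrefixCodedBytes so that it matches the prefix-coded terms LongField wrote under the Lucene 4/5 numeric encoding. In Lucene 6+ numerics became point fields, which cannot serve as update terms; a common pattern, sketched here as an assumption rather than this project's actual code, is to index the id twice:

                long id = status.getId();
                doc.add(new LongPoint("id", id));                                       // range/exact queries
                doc.add(new StringField("id_str", Long.toString(id), Field.Store.YES)); // update key
                writer.updateDocument(new Term("id_str", Long.toString(id)), doc);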

From source file:io.datalayer.lucene.index.IndexerMain.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {

    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you
                // require. For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify
                // a Reader, so that the text of the file is tokenized and indexed,
                // but not stored. Note that FileReader expects the file to be in
                // UTF-8 encoding. If that's not the case searching for special
                // characters will fail.
                /*
                 * doc.add(new TextField("contents", new BufferedReader(new
                 * InputStreamReader(fis, "UTF-8"))));
                 */
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    LOGGER.info("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the old
                    // one matching the exact path, if present:
                    LOGGER.info("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }

    }

}
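
Several of these examples branch on writer.getConfig().getOpenMode(). The mode is chosen on the IndexWriterConfig before the writer is created; a minimal sketch (Lucene 5+):

    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    // CREATE: replace any existing index; APPEND: fail if none exists;
    // CREATE_OR_APPEND: create if missing, otherwise open the existing index.
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(dir, iwc);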

From source file:io.datalayer.lucene.index.LuceneLifecycleTest.java

License:Apache License

/**
 * #A 2 docs in the index
 * 
 * #B Delete first document
 * 
 * #C 1 indexed document, 0 deleted documents
 */
@Test
public void testUpdate() throws IOException {

    assertEquals(1, getHitCount("city", "Amsterdam"));

    IndexWriter writer = getWriter();

    Document doc = new Document();
    doc.add(new StoredField("id", "1"));
    doc.add(new StoredField("country", "Netherlands"));
    doc.add(new Field("contents", "Den Haag has a lot of museums", AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));
    doc.add(new Field("city", "Den Haag", AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));

    writer.updateDocument(new Term("id", "1"), doc);
    writer.close();

    assertEquals(0, getHitCount("city", "Amsterdam"));
    assertEquals(1, getHitCount("city", "Haag"));
}
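
updateDocument is functionally a delete-by-term followed by an add, performed atomically so a reader opened mid-update never sees the document missing. For contrast, the non-atomic spelling of the update in this test would be:

    // Not atomic: a reader opened between these two calls sees neither version.
    writer.deleteDocuments(new Term("id", "1"));
    writer.addDocument(doc);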

From source file:io.jpress.searcher.LuceneSearcher.java

License:LGPL

@Override
public void updateBean(SearcherBean bean) {
    try {
        IndexWriter indexWriter = createIndexWriter();
        Term term = new Term("sid", bean.getSid());
        indexWriter.updateDocument(term, createDocument(bean));
        indexWriter.commit();
        indexWriter.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
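
Creating, committing, and closing a writer on every update is expensive, and IndexWriter is thread-safe and intended to be long-lived. A sketch of the same method against a single shared writer (SearcherBean, createDocument, and createIndexWriter are the example's own helpers):

@Override
public void updateBean(SearcherBean bean) {
    try {
        // indexWriter is assumed to be a long-lived field initialized once
        // via createIndexWriter() instead of being recreated per call:
        indexWriter.updateDocument(new Term("sid", bean.getSid()), createDocument(bean));
        indexWriter.commit(); // or batch commits / use a near-real-time reader
    } catch (IOException e) {
        e.printStackTrace();
    }
}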

From source file:io.seldon.semvec.CreateLuceneIndexFromDb.java

License:Apache License

private void saveDocument(long id, IndexSearcher reader, IndexWriter writer, BufferedWriter fileWriter)
        throws CorruptIndexException, IOException {
    String path;
    if (rawIds)
        path = "" + id;
    else
        path = sequentialIds ? toSV(this.seqId++) : toSV(id);
    if (reader != null && appendOnly) {
        Term docPathTerm = new Term(FIELD_PATH, path);
        TermQuery tq = new TermQuery(docPathTerm);
        int hits = reader.search(tq, 1).totalHits;
        if (hits > 0) {
            if (debug)
                System.out.println("Skipping existing doc with id " + id);
            return; // document exists so don't do anything
        }
    }
    String comments = null;
    String nlpComments = null;
    switch (this.extractionMethod) {
    case COMMENTS:
        comments = docStore.getComments(id);
        break;
    case ITEM_ATTR:
        if (attrIds != null && attrIds.length > 0)
            comments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(attrIds)));
        else if (attrNames != null && attrNames.length > 0)
            comments = docStore.getItemTextualByName(id, new HashSet<String>(Arrays.asList(attrNames)));
        else
            comments = docStore.getItemTextual(id);
        if (nlpAttrIds != null && nlpAttrIds.length > 0)
            nlpComments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(nlpAttrIds)));
        break;
    case USERS:
        comments = docStore.getUserItems(id, useItemIds);
        break;
    case USER_ACTIONS:
        comments = docStore.getUserActionAttrs(id, new HashSet<Integer>(Arrays.asList(attrIds)));
        break;
    case USER_DIM:
        comments = docStore.getDimTextual(id, new HashSet<Integer>(Arrays.asList(textAttrIds)), itemLimit);
        break;
    }

    if (comments != null) {
        if (this.removeHtml) {
            System.out.println("removing html");
            Source source = new Source(comments);
            comments = source.getTextExtractor().toString();

            if (nlpComments != null) {
                source = new Source(nlpComments);
                nlpComments = source.getTextExtractor().toString();
            }
        }

        comments = comments.replaceAll("\\,|\\.|\\!|\\;|\\/", " ");
        if (transLiterate) {
            System.out.println("removing punctuation");
            comments = TransliteratorPeer.getPunctuationTransLiterator().transliterate(comments);
        }

        if (addEntities != null && nlpComments == null)
            comments = addEntities.process(comments);
        else if (addEntities != null && nlpComments != null) {
            nlpComments = addEntities.process(nlpComments);
            //System.out.println("NLP Comments:["+nlpComments+"]");
            //System.out.println("Existing comments:"+comments);
            comments = comments + " " + nlpComments;
        } else {
            comments = comments.replaceAll("\\,|\\.|\\!|\\;|\\/", " ");
            if (transLiterate) {
                System.out.println("removing punctuation");
                comments = TransliteratorPeer.getPunctuationTransLiterator().transliterate(comments);
            }
        }
        comments = comments.replaceAll("\\|", "");
        comments = comments.trim();
        String[] tokens = comments.split(" ");
        if (!"".equals(comments) && tokens.length >= minTokens) {
            if (debug)
                System.out.println("adding document for id " + id + " with text:[" + comments + "]");
            if (reader != null) {
                Term docPathTerm = new Term(FIELD_PATH, path);
                TermQuery tq = new TermQuery(docPathTerm);
                int hits = reader.search(tq, 1).totalHits;
                if (hits > 0) // doc exists in index (assumes a unique match...)
                    writer.updateDocument(docPathTerm, createDoc(path, comments));
                else
                    writer.addDocument(createDoc(path, comments));
            } else
                writer.addDocument(createDoc(path, comments));

            if (fileWriter != null) {
                if (yahooLDAfile != null) {
                    fileWriter.write("" + id);
                    fileWriter.write(" ");
                    fileWriter.write(path);
                    fileWriter.write(" ");
                    fileWriter.write(comments);
                    fileWriter.write("\n");
                } else {
                    fileWriter.write("" + id);
                    fileWriter.write(",");
                    fileWriter.write(comments);
                    fileWriter.write("\n");
                }
            }
        } else
            System.out.println("Skipping document with id " + id + " of token length " + tokens.length);
    }
}
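
When appendOnly is off, the reader-based existence check before writing duplicates work updateDocument already does: an update whose term matches nothing is simply an add. The whole branch can therefore collapse to one call, sketched with the example's own names:

            // updateDocument behaves like addDocument when no document matches
            // docPathTerm, so the hits > 0 branch is redundant:
            writer.updateDocument(new Term(FIELD_PATH, path), createDoc(path, comments));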

From source file:l3.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given, recurses over files and directories
 * found under the given directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For good throughput, put multiple documents
 * into your input file(s). An example of this is in the benchmark module, which can create "line doc" files, one
 * document per line, using the <a
 * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be stored
 * @param file
 *            The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 *             If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:lia.chapter2.IndexingTest.java

License:Apache License

@Test
public void testUpdate() throws IOException {

    assertEquals(1, getHitCount("city", "Amsterdam"));

    IndexWriter writer = getWriter();

    Document doc = new Document(); //A
    doc.add(new Field("id", "1", StringField.TYPE_STORED)); //A
    doc.add(new Field("country", "Netherlands", StringField.TYPE_STORED)); //A
    doc.add(new Field("contents", "Den Haag has a lot of museums", TextField.TYPE_STORED)); //A
    doc.add(new Field("city", "Den Haag", TextField.TYPE_STORED)); //A

    writer.updateDocument(new Term("id", "1"), //B
            doc); //B
    writer.close();

    assertEquals(0, getHitCount("city", "Amsterdam"));//C
    assertEquals(1, getHitCount("city", "Haag")); //D
}
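
The assertions rely on a getHitCount helper that this page does not show. A hypothetical sketch of such a helper, assuming the test keeps its index in a directory field (IndexSearcher.count exists since Lucene 5.1):

private int getHitCount(String fieldName, String searchString) throws IOException {
    // Hypothetical helper, not from the original test class.
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        return searcher.count(new TermQuery(new Term(fieldName, searchString)));
    }
}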