Example usage for org.apache.lucene.index IndexWriter updateDocument

Introduction

This page collects example usages of org.apache.lucene.index.IndexWriter updateDocument.

Prototype

private long updateDocument(final DocumentsWriterDeleteQueue.Node<?> delNode,
            Iterable<? extends IndexableField> doc) throws IOException

This is IndexWriter's private internal overload. Application code, including every example below, goes through the public overload updateDocument(Term term, Iterable<? extends IndexableField> doc), which atomically deletes every document containing term and then adds doc.
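
A minimal sketch of the public overload (Lucene 5+ constructors; the index path and field names are illustrative):

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class UpdateDocumentSketch {
    public static void main(String[] args) throws IOException {
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new StringField("id", "42", Field.Store.YES)); // unique, untokenized key
            doc.add(new TextField("contents", "updated text", Field.Store.NO));
            // Deletes any document whose "id" field contains the term "42",
            // then adds doc, as a single atomic operation:
            writer.updateDocument(new Term("id", "42"), doc);
        }
    }
}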

Usage

From source file:index.IndexOmimtsv.java

License:Apache License

/** Indexes the rows of an OMIM TSV file, one Lucene document per row */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {
    try (InputStream stream = Files.newInputStream(file)) {
        // make a new, empty document
        Document doc = new Document();

        // Add the path of the file as a field named "path".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field pathField = new StringField("path", file.toString(), Field.Store.YES);
        doc.add(pathField);

        // Add the last modified date of the file as a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down to the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Parse the TSV content line by line, creating one document per row.
        // Read with an explicit UTF-8 charset so special characters survive:
        InputStreamReader ipsr = new InputStreamReader(stream, StandardCharsets.UTF_8);
        BufferedReader br = new BufferedReader(ipsr);
        String line = br.readLine(); // skip the first line
        int cpt = 0;
        while ((line = br.readLine()) != null) {
            String[] tokens = line.trim().split("\t");
            if (tokens.length > 6) {
                String id = tokens[0].split("/")[tokens[0].split("/").length - 1].trim();
                if (id.matches("^[0-9]*")) {
                    doc = new Document();
                    cpt++;
                    doc.add(new TextField("ID", id, Field.Store.NO));
                    if (!tokens[5].trim().matches("^C[0-9].*")) {
                        for (String token : tokens) {
                            if (token.trim().matches("^C[0-9].*")) {
                                doc.add(new StoredField("CUI", token.trim()));
                                break;
                            }
                        }
                        if (doc.getFields().size() != 2)
                            doc.add(new StoredField("CUI", ""));
                    } else
                        doc.add(new StoredField("CUI", tokens[5].trim()));
                    doc.add(new StoredField("Label", tokens[1].trim()));
                    writer.addDocument(doc);
                }
            }
        }
        System.out.println("Number of documents indexed: " + cpt);
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + file);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}
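
The example above keys each TSV row on an "ID" value but still writes rows with addDocument, so re-running the indexer duplicates them. A hedged sketch of the row-keyed alternative, reusing the example's own doc, id, and writer (switching "ID" to an untokenized StringField so the update term matches exactly):

                    // Inside the row loop, in place of writer.addDocument(doc):
                    doc.add(new StringField("ID", id, Field.Store.NO)); // untokenized update key
                    writer.updateDocument(new Term("ID", id), doc);     // replaces any earlier row with this ID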

From source file:indexer.LuceneIndexerAddDocument.java

/**
 * Indexes a single document with the aid of Apache Tika.
 *
 * @param writer Writer to the index where the given file/dir info will be
 * stored.
 * @param file The file to index, or the directory to recurse into to find
 * files to index.
 * @param attrs This is the attributes from the given file gathered from
 * walking the file tree.
 * @param global This is for reference to the global class variables and
 * methods.
 * @throws IOException
 */
static void indexDoc(IndexWriter writer, Path file, BasicFileAttributes attrs, Global global)
        throws IOException {
    File document = file.toFile();
    // renameTo(itself) is a heuristic lock check: it fails while another
    // process holds the file open (mainly a Windows behavior).
    if (document.renameTo(document)) {
        try (InputStream stream = Files.newInputStream(file)) {

            //make a new, empty document
            Document doc = new Document();

            //Add the path of the file as a field named "path".
            Field pathField = new StringField("path", file.toString(), Field.Store.YES);
            doc.add(pathField);

            //Add the last modified date of the file as a field named "modified".
            doc.add(new LongField("modified", attrs.lastModifiedTime().toMillis(), Field.Store.YES));

            //Add the created date of the file as a field named "created".
            doc.add(new LongField("created", attrs.creationTime().toMillis(), Field.Store.YES));

            //Add the document File Name
            doc.add(new StringField("filename", file.getFileName().toString(), Field.Store.YES));

            //Add the contents of the file as a field named "vcontents". 
            //Parser type for Tika
            BodyContentHandler handler = new BodyContentHandler(global.WRITE_LIMIT);
            Metadata metadata = new Metadata();
            FileInputStream inputstream = new FileInputStream(new File(file.toString()));
            ParseContext pcontext = new ParseContext();

            //New Field Type
            FieldType bodyType = new FieldType();
            bodyType.setStored(true);
            bodyType.setTokenized(true);
            // for Highlighter, FastvectorHighlighter
            bodyType.setStoreTermVectors(true);
            bodyType.setStoreTermVectorPositions(true);
            bodyType.setStoreTermVectorOffsets(true);
            // for PostingsHighlighter
            bodyType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);

            /**
             * Determine the document type and the proper parser for the
             * document After the document is determined we grab the content
             * and position offset for highlighting.
             */
            try {
                if (file.toString().endsWith(".pdf")) {
                    PDFParser pdfparser = new PDFParser();
                    pdfparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".docx") || file.toString().endsWith(".pptx")
                        || file.toString().endsWith(".xlsx") || file.toString().endsWith(".docm")
                        || file.toString().endsWith(".pptm") || file.toString().endsWith(".xlsm")) {
                    OOXMLParser msofficeparser = new OOXMLParser();
                    msofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".doc") || file.toString().endsWith(".ppt")
                        || file.toString().endsWith(".xls")) {
                    OfficeParser msofficeparser = new OfficeParser();
                    msofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".odt") || file.toString().endsWith(".odp")
                        || file.toString().endsWith(".ods")) {
                    OpenDocumentParser openofficeparser = new OpenDocumentParser();
                    openofficeparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".epub")) {
                    EpubParser epubParser = new EpubParser();
                    epubParser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".xml")) {
                    XMLParser XMLparser = new XMLParser();
                    XMLparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".htm") || file.toString().endsWith(".html")
                        || file.toString().endsWith(".mhtml")) {
                    HtmlParser HTMLparser = new HtmlParser();
                    HTMLparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".rtf")) {
                    RTFParser RTFparser = new RTFParser();
                    RTFparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else if (file.toString().endsWith(".txt")) {
                    TXTParser TXTparser = new TXTParser();
                    TXTparser.parse(inputstream, handler, metadata, pcontext);
                    doc.add(new Field("vcontent", handler.toString(), bodyType));
                } else {
                    BufferedReader buffedRead = new BufferedReader(
                            new InputStreamReader(stream, StandardCharsets.UTF_8));
                    doc.add(new TextField("vcontent", buffedRead));
                }
            } catch (SAXException | TikaException ex) {
                log.fatal("Document Parsing Exception");
            }

            if (writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE) {
                // New index, so we just add the document (no old document can be there):
                writer.addDocument(doc);
                System.out.println("adding " + file);
            } else {
                /**
                 * Existing index (an old copy of this document may have
                 * been indexed) so we use updateDocument instead to replace
                 * the old one matching the exact path, if present:
                 */
                writer.updateDocument(new Term("path", file.toString()), doc);
                System.out.println("updating " + file);
            }
        }
    } else {
        System.out.println("LOCKED: " + file);
    }
}
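
The extension-by-extension parser chain above can usually be collapsed with Tika's AutoDetectParser, which detects the MIME type and dispatches to the matching parser. A sketch under that assumption, reusing the example's inputstream, handler, metadata, pcontext, doc, and bodyType:

            try {
                // One parser call replaces the whole if/else-if chain over extensions:
                new AutoDetectParser().parse(inputstream, handler, metadata, pcontext);
                doc.add(new Field("vcontent", handler.toString(), bodyType));
            } catch (SAXException | TikaException ex) {
                log.fatal("Document Parsing Exception");
            }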

From source file:InformationRetrieval.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path".  Use a
                // field that is indexed (i.e. searchable), but don't tokenize 
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);
                BufferedReader br = new BufferedReader(new FileReader(file));
                br.readLine(); // skip the first line
                String titleLine = br.readLine();
                Field title = new TextField("Title", titleLine == null ? "" : titleLine, Field.Store.YES);
                title.setBoost(90F);
                doc.add(title);

                // Build a short snippet from the next three lines, if present:
                StringBuilder snipBuilder = new StringBuilder();
                for (int i = 0; i < 3; i++) {
                    String s = br.readLine();
                    if (s == null)
                        break;
                    snipBuilder.append(s).append(' ');
                }

                Field snippet = new StringField("Snippet", snipBuilder.toString().trim(), Field.Store.YES);
                doc.add(snippet);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter).  This indexes to milli-second resolution, which
                // is often too fine.  You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents".  Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so 
                    // we use updateDocument instead to replace the old one matching the exact 
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}
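
Title.setBoost(90F) above is an index-time boost; index-time boosting was deprecated and later removed (Lucene 7). On current versions the usual substitute is a query-time BoostQuery, sketched here with an illustrative query term:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

Query boostedTitle = new BoostQuery(new TermQuery(new Term("Title", "lucene")), 90f);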

From source file:io.anserini.index.IndexTweetsUpdatePlace.java

License:Apache License

@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(new Option(HELP_OPTION, "show help"));
    options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment"));
    options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors"));

    options.addOption(OptionBuilder.withArgName("collection").hasArg()
            .withDescription("source collection directory").create(COLLECTION_OPTION));
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids")
            .create(DELETES_OPTION));
    options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION)
            || !cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(IndexTweetsUpdatePlace.class.getName(), options);
        System.exit(-1);
    }

    String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION);
    String indexPath = cmdline.getOptionValue(INDEX_OPTION);

    System.out.println(collectionPath + " " + indexPath);

    LOG.info("collection: " + collectionPath);
    LOG.info("index: " + indexPath);

    long startTime = System.currentTimeMillis();
    File file = new File(collectionPath);
    if (!file.exists()) {
        System.err.println("Error: " + file + " does not exist!");
        System.exit(-1);
    }

    final FieldType textOptions = new FieldType();
    textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
    textOptions.setStored(true);
    textOptions.setTokenized(true);
    if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) {
        textOptions.setStoreTermVectors(true);

    }

    final StatusStream stream = new JsonStatusCorpusReader(file);

    final Directory dir = new SimpleFSDirectory(Paths.get(cmdline.getOptionValue(INDEX_OPTION)));
    final IndexWriterConfig config = new IndexWriterConfig(ANALYZER);

    config.setOpenMode(IndexWriterConfig.OpenMode.APPEND);

    final IndexWriter writer = new IndexWriter(dir, config);
    System.out.print("Original # of docs " + writer.numDocs());
    int updateCount = 0;

    Runtime.getRuntime().addShutdownHook(new Thread() {
        public void run() {

            try {
                stream.close();
            } catch (IOException e1) {
                e1.printStackTrace(); // best effort; continue shutting down
            }

            try {
                writer.close();
            } catch (IOException e) {
                e.printStackTrace(); // best effort; continue shutting down
            }
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace(); // best effort; continue shutting down
            }
            System.out.println("Shutting down");

        }
    });
    int cnt = 0;
    Status status;
    try {
        while ((status = stream.next()) != null) {

            if (status.getPlace() != null) {

                //               Query q = NumericRangeQuery.newLongRange(TweetStreamReader.StatusField.ID.name, status.getId(),
                //                     status.getId(), true, true);
                //               System.out.print("Deleting docCount="+writer.numDocs());
                //               writer.deleteDocuments(q);
                //               writer.commit();
                //               System.out.print(" Deleted docCount="+writer.numDocs());

                Document doc = new Document();
                doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES));
                doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES));
                doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES));

                doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions));

                doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFriendsCount(), Store.YES));
                doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFollowersCount(), Store.YES));
                doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES));
                doc.add(new DoubleField(StatusField.LONGITUDE.name, status.getLongitude(), Store.YES));
                doc.add(new DoubleField(StatusField.LATITUDE.name, status.getLatitude(), Store.YES));
                doc.add(new StringField(StatusField.PLACE.name, status.getPlace(), Store.YES));
                long inReplyToStatusId = status.getInReplyToStatusId();
                if (inReplyToStatusId > 0) {
                    doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(),
                            Field.Store.YES));
                }

                String lang = status.getLang();
                if (!lang.equals("unknown")) {
                    doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES));
                }

                long retweetStatusId = status.getRetweetedStatusId();
                if (retweetStatusId > 0) {
                    doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId,
                            Field.Store.YES));
                    doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(),
                            Field.Store.YES));
                    doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES));
                    if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) {
                        LOG.warn("Error parsing retweet fields of " + status.getId());
                    }
                }

                long id = status.getId();
                BytesRefBuilder brb = new BytesRefBuilder();
                NumericUtils.longToPrefixCodedBytes(id, 0, brb);
                Term term = new Term(StatusField.ID.name, brb.get());
                writer.updateDocument(term, doc);

                //               writer.addDocument(doc);

                updateCount += 1;

                if (updateCount % 10000 == 0) {

                    LOG.info(updateCount + " statuses updated");
                    writer.commit();
                    System.out.println("Updated docCount=" + writer.numDocs());
                }

            }

        }

        LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        writer.close();
        dir.close();
        stream.close();
    }
}
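
The update term above is built with NumericUtils.longToPrefixCodedBytes so that it matches the prefix-coded terms LongField wrote under the Lucene 4/5 numeric encoding. In Lucene 6+ numerics became point fields, which cannot serve as update terms; a common pattern, sketched here as an assumption rather than this project's actual code, is to index the id twice:

                long id = status.getId();
                doc.add(new LongPoint("id", id));                                       // range/exact queries
                doc.add(new StringField("id_str", Long.toString(id), Field.Store.YES)); // update key
                writer.updateDocument(new Term("id_str", Long.toString(id)), doc);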

From source file:io.datalayer.lucene.index.IndexerMain.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {

    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you
                // require. For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify
                // a Reader, so that the text of the file is tokenized and indexed,
                // but not stored. Note that FileReader expects the file to be in
                // UTF-8 encoding. If that's not the case searching for special
                // characters will fail.
                /*
                 * doc.add(new TextField("contents", new BufferedReader(new
                 * InputStreamReader(fis, "UTF-8"))));
                 */
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    LOGGER.info("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the old
                    // one matching the exact path, if present:
                    LOGGER.info("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }

    }

}
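
Several of these examples branch on writer.getConfig().getOpenMode(). The mode is chosen on the IndexWriterConfig before the writer is created; a minimal sketch (Lucene 5+):

    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    // CREATE: replace any existing index; APPEND: fail if none exists;
    // CREATE_OR_APPEND: create if missing, otherwise open the existing index.
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(dir, iwc);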

From source file:io.datalayer.lucene.index.LuceneLifecycleTest.java

License:Apache License

/**
 * #A 2 docs in the index
 * 
 * #B Delete first document
 * 
 * #C 1 indexed document, 0 deleted documents
 */
@Test
public void testUpdate() throws IOException {

    assertEquals(1, getHitCount("city", "Amsterdam"));

    IndexWriter writer = getWriter();

    Document doc = new Document();
    doc.add(new StoredField("id", "1"));
    doc.add(new StoredField("country", "Netherlands"));
    doc.add(new Field("contents", "Den Haag has a lot of museums", AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));
    doc.add(new Field("city", "Den Haag", AosFieldType.INDEXED_STOREDNOT_TERMVECTORS));

    writer.updateDocument(new Term("id", "1"), doc);
    writer.close();

    assertEquals(0, getHitCount("city", "Amsterdam"));
    assertEquals(1, getHitCount("city", "Haag"));
}
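
updateDocument is functionally a delete-by-term followed by an add, performed atomically so a reader opened mid-update never sees the document missing. For contrast, the non-atomic spelling of the update in this test would be:

    // Not atomic: a reader opened between these two calls sees neither version.
    writer.deleteDocuments(new Term("id", "1"));
    writer.addDocument(doc);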

From source file:io.jpress.searcher.LuceneSearcher.java

License:LGPL

@Override
public void updateBean(SearcherBean bean) {
    try {
        IndexWriter indexWriter = createIndexWriter();
        Term term = new Term("sid", bean.getSid());
        indexWriter.updateDocument(term, createDocument(bean));
        indexWriter.commit();
        indexWriter.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
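
Creating, committing, and closing a writer on every update is expensive, and IndexWriter is thread-safe and intended to be long-lived. A sketch of the same method against a single shared writer (SearcherBean, createDocument, and createIndexWriter are the example's own helpers):

@Override
public void updateBean(SearcherBean bean) {
    try {
        // indexWriter is assumed to be a long-lived field initialized once
        // via createIndexWriter() instead of being recreated per call:
        indexWriter.updateDocument(new Term("sid", bean.getSid()), createDocument(bean));
        indexWriter.commit(); // or batch commits / use a near-real-time reader
    } catch (IOException e) {
        e.printStackTrace();
    }
}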

From source file:io.seldon.semvec.CreateLuceneIndexFromDb.java

License:Apache License

private void saveDocument(long id, IndexSearcher reader, IndexWriter writer, BufferedWriter fileWriter)
        throws CorruptIndexException, IOException {
    String path;
    if (rawIds)
        path = "" + id;
    else
        path = sequentialIds ? toSV(this.seqId++) : toSV(id);
    if (reader != null && appendOnly) {
        Term docPathTerm = new Term(FIELD_PATH, path);
        TermQuery tq = new TermQuery(docPathTerm);
        int hits = reader.search(tq, 1).totalHits;
        if (hits > 0) {
            if (debug)
                System.out.println("Skipping existing doc with id " + id);
            return; // document exists so don't do anything
        }
    }
    String comments = null;
    String nlpComments = null;
    switch (this.extractionMethod) {
    case COMMENTS:
        comments = docStore.getComments(id);
        break;
    case ITEM_ATTR:
        if (attrIds != null && attrIds.length > 0)
            comments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(attrIds)));
        else if (attrNames != null && attrNames.length > 0)
            comments = docStore.getItemTextualByName(id, new HashSet<String>(Arrays.asList(attrNames)));
        else
            comments = docStore.getItemTextual(id);
        if (nlpAttrIds != null && nlpAttrIds.length > 0)
            nlpComments = docStore.getItemTextualById(id, new HashSet<Integer>(Arrays.asList(nlpAttrIds)));
        break;
    case USERS:
        comments = docStore.getUserItems(id, useItemIds);
        break;
    case USER_ACTIONS:
        comments = docStore.getUserActionAttrs(id, new HashSet<Integer>(Arrays.asList(attrIds)));
        break;
    case USER_DIM:
        comments = docStore.getDimTextual(id, new HashSet<Integer>(Arrays.asList(textAttrIds)), itemLimit);
        break;
    }

    if (comments != null) {
        if (this.removeHtml) {
            System.out.println("removing html");
            Source source = new Source(comments);
            comments = source.getTextExtractor().toString();

            if (nlpComments != null) {
                source = new Source(nlpComments);
                nlpComments = source.getTextExtractor().toString();
            }
        }

        comments = comments.replaceAll("\\,|\\.|\\!|\\;|\\/", " ");
        if (transLiterate) {
            System.out.println("removing punctuation");
            comments = TransliteratorPeer.getPunctuationTransLiterator().transliterate(comments);
        }

        if (addEntities != null && nlpComments == null)
            comments = addEntities.process(comments);
        else if (addEntities != null && nlpComments != null) {
            nlpComments = addEntities.process(nlpComments);
            //System.out.println("NLP Comments:["+nlpComments+"]");
            //System.out.println("Existing comments:"+comments);
            comments = comments + " " + nlpComments;
        } else {
            comments = comments.replaceAll("\\,|\\.|\\!|\\;|\\/", " ");
            if (transLiterate) {
                System.out.println("removing punctuation");
                comments = TransliteratorPeer.getPunctuationTransLiterator().transliterate(comments);
            }
        }
        comments = comments.replaceAll("\\|", "");
        comments = comments.trim();
        String[] tokens = comments.split(" ");
        if (!"".equals(comments) && tokens.length >= minTokens) {
            if (debug)
                System.out.println("adding document for id " + id + " with text:[" + comments + "]");
            if (reader != null) {
                Term docPathTerm = new Term(FIELD_PATH, path);
                TermQuery tq = new TermQuery(docPathTerm);
                int hits = reader.search(tq, 1).totalHits;
                if (hits > 0) // doc exists in index (assumes a unique match...)
                    writer.updateDocument(docPathTerm, createDoc(path, comments));
                else
                    writer.addDocument(createDoc(path, comments));
            } else
                writer.addDocument(createDoc(path, comments));

            if (fileWriter != null) {
                if (yahooLDAfile != null) {
                    fileWriter.write("" + id);
                    fileWriter.write(" ");
                    fileWriter.write(path);
                    fileWriter.write(" ");
                    fileWriter.write(comments);
                    fileWriter.write("\n");
                } else {
                    fileWriter.write("" + id);
                    fileWriter.write(",");
                    fileWriter.write(comments);
                    fileWriter.write("\n");
                }
            }
        } else
            System.out.println("Skipping document with id " + id + " of token length " + tokens.length);
    }
}
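
When appendOnly is off, the reader-based existence check before writing duplicates work updateDocument already does: an update whose term matches nothing is simply an add. The whole branch can therefore collapse to one call, sketched with the example's own names:

            // updateDocument behaves like addDocument when no document matches
            // docPathTerm, so the hits > 0 branch is redundant:
            writer.updateDocument(new Term(FIELD_PATH, path), createDoc(path, comments));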

From source file:l3.IndexFiles.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given, recurses over files and directories
 * found under the given directory.
 * 
 * NOTE: This method indexes one document per input file. This is slow. For good throughput, put multiple documents
 * into your input file(s). An example of this is in the benchmark module, which can create "line doc" files, one
 * document per line, using the <a
 * href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 * 
 * @param writer
 *            Writer to the index where the given file/dir info will be stored
 * @param file
 *            The file to index, or the directory to recurse into to find files to index
 * @throws IOException
 *             If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file as a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to milli-second resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you require.
                // For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify a Reader,
                // so that the text of the file is tokenized and indexed, but not stored.
                // Note that FileReader expects the file to be in UTF-8 encoding.
                // If that's not the case searching for special characters will fail.
                doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been indexed) so
                    // we use updateDocument instead to replace the old one matching the exact
                    // path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }

            } finally {
                fis.close();
            }
        }
    }
}

From source file:lia.chapter2.IndexingTest.java

License:Apache License

@Test
public void testUpdate() throws IOException {

    assertEquals(1, getHitCount("city", "Amsterdam"));

    IndexWriter writer = getWriter();

    Document doc = new Document(); //A
    doc.add(new Field("id", "1", StringField.TYPE_STORED)); //A
    doc.add(new Field("country", "Netherlands", StringField.TYPE_STORED)); //A
    doc.add(new Field("contents", "Den Haag has a lot of museums", TextField.TYPE_STORED)); //A
    doc.add(new Field("city", "Den Haag", TextField.TYPE_STORED)); //A

    writer.updateDocument(new Term("id", "1"), //B
            doc); //B
    writer.close();

    assertEquals(0, getHitCount("city", "Amsterdam"));//C
    assertEquals(1, getHitCount("city", "Haag")); //D
}
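
The assertions rely on a getHitCount helper that this page does not show. A hypothetical sketch of such a helper, assuming the test keeps its index in a directory field (IndexSearcher.count exists since Lucene 5.1):

private int getHitCount(String fieldName, String searchString) throws IOException {
    // Hypothetical helper, not from the original test class.
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        return searcher.count(new TermQuery(new Term(fieldName, searchString)));
    }
}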