Example usage for org.apache.lucene.index IndexReader maxDoc

List of usage examples for org.apache.lucene.index IndexReader maxDoc

Introduction

This page shows example usage of org.apache.lucene.index IndexReader maxDoc.

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
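Document numbers are dense integers starting at 0, so maxDoc() is the exclusive upper bound for scanning every slot in an index. A minimal sketch using the pre-4.0 API that the examples below rely on (isDeleted and per-reader document were removed in Lucene 4):

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;

public static void printAllDocuments(IndexReader reader) throws Exception {
    for (int i = 0; i < reader.maxDoc(); i++) { // IDs run from 0 to maxDoc() - 1
        if (reader.isDeleted(i)) {
            continue; // slot belongs to a deleted document
        }
        Document doc = reader.document(i);
        System.out.println(doc);
    }
}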

Usage

From source file:org.hippoecm.repository.query.lucene.util.CachingMultiReaderQueryFilter.java

License:Apache License

@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    if (reader instanceof MultiIndexReader) {
        MultiIndexReader multiIndexReader = (MultiIndexReader) reader;

        IndexReader[] indexReaders = multiIndexReader.getIndexReaders();
        DocIdSet[] docIdSets = new DocIdSet[indexReaders.length];
        int[] maxDocs = new int[indexReaders.length];
        for (int i = 0; i < indexReaders.length; i++) {
            IndexReader subReader = indexReaders[i];
            docIdSets[i] = getIndexReaderDocIdSet(subReader, subReader);
            maxDocs[i] = subReader.maxDoc();
        }

        return new MultiDocIdSet(docIdSets, maxDocs);
    }
    log.warn(
            "MultiIndexReader was expected but not found. Do not dissect the reader but use it as one instead");

    return getIndexReaderDocIdSet(reader, reader);
}
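
MultiDocIdSet (a Hippo class, not shown here) combines the per-reader sets using the collected maxDoc() values. A minimal sketch of the underlying idea, assuming it offsets each sub-reader's local doc IDs by the cumulative maxDoc of the readers before it; the real implementation may differ:

import java.io.IOException;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;

// hypothetical composite iterator illustrating the offset arithmetic
class CompositeDocIdSetIterator extends DocIdSetIterator {
    private final DocIdSetIterator[] subIterators;
    private final int[] starts; // starts[i] = sum of maxDocs[0..i-1]
    private int current = 0;
    private int doc = -1;

    CompositeDocIdSetIterator(DocIdSet[] docIdSets, int[] maxDocs) throws IOException {
        subIterators = new DocIdSetIterator[docIdSets.length];
        starts = new int[docIdSets.length];
        int offset = 0;
        for (int i = 0; i < docIdSets.length; i++) {
            subIterators[i] = docIdSets[i].iterator();
            starts[i] = offset;
            offset += maxDocs[i];
        }
    }

    @Override
    public int docID() {
        return doc;
    }

    @Override
    public int nextDoc() throws IOException {
        while (current < subIterators.length) {
            int local = subIterators[current].nextDoc();
            if (local != NO_MORE_DOCS) {
                return doc = starts[current] + local; // translate to the global ID space
            }
            current++; // sub-reader exhausted, move on to the next one
        }
        return doc = NO_MORE_DOCS;
    }

    @Override
    public int advance(int target) throws IOException {
        int d;
        while ((d = nextDoc()) < target) {
            // linear advance is enough for a sketch
        }
        return d;
    }
}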

From source file:org.hippoecm.repository.query.lucene.util.CachingMultiReaderQueryFilter.java

License:Apache License

private OpenBitSet createDocIdSet(IndexReader reader) throws IOException {
    final OpenBitSet bits = new OpenBitSet(reader.maxDoc());

    long start = System.currentTimeMillis();
    new IndexSearcher(reader).search(query, new AbstractHitCollector() {

        @Override
        public final void collect(int doc, float score) {
            bits.set(doc); // set bit for hit
        }
    });
    log.info("Creating CachingMultiReaderQueryFilter doc id set took {} ms.",
            String.valueOf(System.currentTimeMillis() - start));
    return bits;
}
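
A usage sketch for the returned set: in Lucene 3.x OpenBitSet extends DocIdSet, so the cached bits can be iterated directly; every reported ID is below the maxDoc() the set was sized with.

    OpenBitSet bits = createDocIdSet(reader);
    DocIdSetIterator it = bits.iterator();
    for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        // doc is a matching document number in [0, reader.maxDoc())
    }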

From source file:org.infoglue.cms.controllers.kernel.impl.simple.LuceneController.java

License:Open Source License

public Map getIndexInformation() {
    Map info = new HashMap();

    try {
        String index = CmsPropertyHandler.getContextRootPath() + File.separator + "lucene" + File.separator
                + "index";

        boolean indexExists = IndexReader.indexExists(new File(index));
        if (!indexExists) {
            try {
                File INDEX_DIR = new File(index);
                IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer());
                logger.info("Indexing to directory '" + INDEX_DIR + "'...");
                writer.deleteDocuments(new Term("initializer", "true"));
                logger.info("Optimizing...");
                writer.optimize();
                writer.close();
            } catch (Exception e) {
                logger.error("Error creating index:" + e.getMessage(), e);
            }
        }

        IndexReader reader = IndexReader.open(index);
        int maxDoc = reader.maxDoc();
        int numDoc = reader.numDocs();
        long lastModified = IndexReader.lastModified(index);

        info.put("maxDoc", new Integer(maxDoc));
        info.put("numDoc", new Integer(numDoc));
        info.put("lastModified", new Date(lastModified));

        reader.close();
    } catch (Exception e) {
        logger.error("Error getting index information:" + e.getMessage(), e);
    }

    return info;
}
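
maxDoc() counts every document slot while numDocs() excludes deletions, so the difference between the two values put into the map above is the number of deleted-but-not-yet-merged documents. A sketch of an extra entry that could be added inside the try block (the "deletedDoc" key is hypothetical):

    int deletedDocs = maxDoc - numDoc; // slots still held by deleted documents
    info.put("deletedDoc", new Integer(deletedDocs)); // hypothetical extra entry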

From source file:org.jab.docsearch.Index.java

License:Open Source License

/**
 * The location of a URL in an index; used in the algorithm for updating an
 * index.
 *
 * @return the location of the SpiderUrl in a web oriented DocSearcherIndex,
 *         or -1 if the URL is not in the index
 */
public int spiderIndexNum(int lastFound, String fileName, IndexReader ir) {
    int returnInt = -1;
    synchronized (this) {
        if (lastFound == -1)
            lastFound = 0;
        try {
            Document doc;
            String compareName = "";
            int numDocs = ir.maxDoc();
            for (int i = lastFound; i < numDocs; i++) {
                if (!ir.isDeleted(i)) {
                    doc = ir.document(i);
                    if (doc != null) {
                        compareName = doc.get(FIELD_URL);
                        if (compareName.equals(fileName)) {
                            returnInt = i;
                            break;
                        }
                    }
                }
            }
            if (returnInt == -1) {
                for (int i = lastFound; i >= 0; i--) { // i >= 0 so that doc 0 is also checked
                    if (!ir.isDeleted(i)) {
                        doc = ir.document(i);
                        if (doc != null) {
                            compareName = doc.get(FIELD_URL);
                            // System.out.println("Comparing "+compareName+"
                            // to "+fileName);
                            if (compareName.equals(fileName)) {
                                // System.out.println("MATCH FOUND AT "+i);
                                returnInt = i;
                                break;
                            }
                        }
                    }
                }
            }

            if (returnInt == -1)
                ds.setStatus("File " + fileName + " not found in index!");
        } catch (Exception e) {
            logger.error("spiderIndexNum() failed", e);
            ds.setStatus("Error determining if doc is already in index!");
        }
        return returnInt;
    }
}

From source file:org.jab.docsearch.Index.java

License:Open Source License

/**
 * Location of a file in a DocSearcher index; used by the update algorithm to
 * update an index.
 *
 * @return location of the document in the DocSearcherIndex or -1 if it is
 *         not in there
 */
public int indexNum(int lastFound, String fileName, IndexReader ir) {
    int returnInt = -1;
    synchronized (this) {
        if (lastFound == -1)
            lastFound = 0;
        try {
            Document doc;
            String compareName = "";
            int numDocs = ir.maxDoc();
            for (int i = lastFound; i < numDocs; i++) {
                if (!ir.isDeleted(i)) {
                    doc = ir.document(i);
                    if (doc != null) {
                        compareName = doc.get(FIELD_PATH);
                        if (compareName.equals(fileName)) {
                            returnInt = i;
                            break;
                        }
                    }
                }
            }
            if (returnInt == -1) {
                for (int i = lastFound; i >= 0; i--) { // i >= 0 so that doc 0 is also checked
                    if (!ir.isDeleted(i)) {
                        doc = ir.document(i);
                        if (doc != null) {
                            compareName = doc.get(FIELD_PATH);
                            // System.out.println("Comparing "+compareName+"
                            // to "+fileName);
                            if (compareName.equals(fileName)) {
                                // System.out.println("MATCH FOUND AT "+i);
                                returnInt = i;
                                break;
                            }
                        }
                    }
                }
            }

            if (returnInt == -1)
                ds.setStatus("File " + fileName + " not found in index!");
        } catch (Exception e) {
            logger.error("indexNum() failed", e);
            ds.setStatus("Error determining if doc is already in index!");
        }
        return returnInt;
    }
}

From source file:org.jab.docsearch.Index.java

License:Open Source License

/**
 * Updates a DocSearcherIndex.
 *
 * @param di  DocSearcherIndex
 */
public void updateIndex(final DocSearcherIndex di) {
    notesBuf = new StringBuffer();
    newItsBuf = new StringBuffer();
    modItsItsBuf = new StringBuffer();
    delItsItsBuf = new StringBuffer();
    totalChanges = 0;
    long curFileSizeBytes = 0;
    int errNum = 0;
    StringBuffer noRobotsBuf = new StringBuffer();
    int numNoIndex = 0;
    // int numErrors = 0;
    StringBuffer failedBuf = new StringBuffer();
    int addedSuccessFully = 0;
    failedBuf.append("\n");
    synchronized (this) {
        if (di.isCdrom()) {
            // do nothing
        } else if (di.getIsSpider()) {
            doSpiderUpdate(di);
        } else if (!di.getPath().toLowerCase().endsWith(".zip")) { // not a zip archive
            int numUpdates = 0;
            int numRemovals = 0;
            int numNew = 0;
            try {
                IndexReader ir = IndexReader.open(di.getIndexPath());
                int numDocs = ir.maxDoc();
                ds.setStatus(
                        "There are " + numDocs + " docs in index " + di.getName() + "(" + di.getPath() + ")");
                addHeader(di.getName());
                //ArrayList<String> allDocsInIndexx = new ArrayList<String>(); // indexed files
                // ArrayList allDocsInFolder = new ArrayList(); // current files
                // ArrayList newDocsToAdd = new ArrayList(); // files to be added that are new
                ds.setIsWorking(true);
                ds.setProgressMax(numDocs);
                ds.setCurProgressMSG("Updating Modified Files...");
                setInsertMode(1); // note we are looking for modified files

                logger.info("updateIndex() updating " + numDocs + " document from index");

                for (int i = 0; i < numDocs; i++) {
                    if (!ds.getIsWorking()) {
                        break;
                    }
                    if (!ir.isDeleted(i)) {
                        ds.setCurProgress(i);
                        Document doc = ir.document(i);
                        if (doc != null) {
                            String curFiName = doc.get(FIELD_PATH);
                            String curFiModDate = doc.get(FIELD_MODDATE);
                            File testFi = new File(curFiName);

                            // check file not found
                            if (testFi.exists()) {
                                //allDocsInIndex.add(curFiName);
                                String realFileModDate = DateTimeUtils
                                        .getTimeStringForIndex(testFi.lastModified());

                                // check file is changed
                                if (!realFileModDate.equals(curFiModDate)) {
                                    logger.info("updateIndex() updating " + curFiName + " in index");

                                    // remove old document (a successful reindex is counted in the switch below)
                                    ir.deleteDocument(i);
                                    ir.close();
                                    // open writer to add document once again
                                    ds.setStatus("Reindexing: " + curFiName);
                                    IndexWriter iw = new IndexWriter(di.getIndexPath(), new StandardAnalyzer(),
                                            false);
                                    // next line should remove too many files open errors
                                    // iw.setUseCompoundFile(true);
                                    addedSuccessFully = addDocToIndex(curFiName, iw, di, di.isCdrom(), null);
                                    iw.close();
                                    // reopen
                                    ir = IndexReader.open(di.getIndexPath());
                                    switch (addedSuccessFully) {
                                    case 1: // error
                                        errNum++;
                                        if (errNum < 8) {
                                            failedBuf.append("\n");
                                            failedBuf.append(curFiName);
                                        }
                                        ds.setStatus(DocSearch.dsErrIdxgFi + " " + curFiName);
                                        break;
                                    case 2: // meta robots = noindex
                                        numNoIndex++;
                                        if (numNoIndex < 8) {
                                            noRobotsBuf.append("\n");
                                            noRobotsBuf.append(curFiName);
                                        }
                                        ds.setStatus("No Indexing Meta Requirement found in : " + curFiName);
                                        break;
                                    default: // OK
                                        numUpdates++;
                                        ds.setStatus("Indexing " + curFiName + " complete.");
                                        break;
                                    } // end of switch
                                }
                            } else {
                                ds.setStatus("Deleting: " + curFiName);
                                logger.info("updateIndex() remove " + curFiName + " from index");
                                ir.deleteDocument(i);
                                addDelNote(doc);
                                numRemovals++;
                            }
                        }
                    }
                    // end if not deleted
                    // else System.out.println("Document was null or deleted:" + i);
                }
                // end of loop over docs in the index
                ds.resetProgress();

                // now add the new files
                setInsertMode(0);
                ArrayList<String> folderList = new ArrayList<String>();
                folderList.add(di.getPath());
                int startSubNum = Utils.countSlash(di.getPath());
                int maxSubNum = startSubNum + di.getDepth();
                int lastItemNo = 0;
                int curItemNo = 0;
                int lastFound = 0;
                do {
                    // create our folder file
                    if (!ds.getIsWorking()) {
                        break;
                    }
                    String curFolderString = folderList.get(curItemNo);
                    logger.debug("updateIndex() folder=" + curFolderString);

                    File curFolderFile = new File(curFolderString);
                    int curSubNum = Utils.countSlash(curFolderString);
                    // handle any subfolders --> add them to our folderlist
                    String[] foldersString = curFolderFile.list(DocSearch.ff);
                    int numFolders = foldersString.length;
                    for (int i = 0; i < numFolders; i++) {
                        // add them to our folderlist
                        String curFold = curFolderString + pathSep + foldersString[i] + pathSep;
                        curFold = Utils.replaceAll(pathSep + pathSep, curFold, pathSep);
                        folderList.add(curFold);
                        lastItemNo++;
                        // debug output
                    }
                    // end of loop over subfolders
                    // add our files
                    String[] filesString = curFolderFile.list(DocSearch.wf);
                    int numFiles = filesString.length;
                    ds.setProgressMax(numDocs);
                    ds.setCurProgressMSG("Updating new Files...");

                    for (int i = 0; i < numFiles; i++) {
                        // add them to our folderlist
                        if (!ds.getIsWorking()) {
                            break;
                        }
                        String curFi = curFolderString + pathSep + filesString[i];
                        curFi = Utils.replaceAll(pathSep + pathSep, curFi, pathSep);
                        curFileSizeBytes = FileUtils.getFileSize(curFi);
                        if (curFileSizeBytes > ds.getMaxFileSize()) {
                            logger.debug("updateIndex() skipping " + curFi + " because is to big");
                            ds.setStatus(I18n.getString("skipping_file_too_big") + " (" + curFileSizeBytes
                                    + ") " + filesString[i]);
                        } else {
                            lastFound = indexNum(lastFound, curFi, ir);
                            if (lastFound == -1) {
                                logger.info("updateIndex() adding " + curFi + " to index");

                                ir.close();
                                // open writer to add document once again
                                IndexWriter iw = new IndexWriter(di.getIndexPath(), new StandardAnalyzer(),
                                        false);
                                addedSuccessFully = addDocToIndex(curFi, iw, di, di.isCdrom(), null);
                                switch (addedSuccessFully) {
                                case 1: // error
                                    errNum++;
                                    if (errNum < 8) {
                                        failedBuf.append("\n");
                                        failedBuf.append(curFi);
                                    }
                                    ds.setStatus(DocSearch.dsErrIdxg + " " + curFi);
                                    break;
                                case 2: // meta robots = noindex
                                    numNoIndex++;
                                    if (numNoIndex < 8) {
                                        noRobotsBuf.append("\n");
                                        noRobotsBuf.append(curFi);
                                    }
                                    ds.setStatus("Document Exlusion (robots = NOINDEX) : " + curFi);
                                    break;
                                default: // OK
                                    numNew++;
                                    ds.setStatus("New Document Added : " + curFi);
                                    break;
                                } // end of switch
                                iw.close();
                                // reopen
                                ir = IndexReader.open(di.getIndexPath());
                            } // end for lastfound not -1
                        } // end for file size not too big
                        ds.setCurProgress(i);
                        ds.resetProgress();
                    }
                    // end of loop over files in this folder
                    // increment our curItem
                    folderList.set(curItemNo, null); // remove memory overhead as you go!
                    curItemNo++;
                    if (curSubNum >= maxSubNum) {
                        break;
                    }
                    if (!ds.getIsWorking()) {
                        break;
                    }
                } while (curItemNo <= lastItemNo);
                //
                ir.close(); // always close!
                StringBuffer updateMSGBuf = new StringBuffer();
                updateMSGBuf.append('\n');
                updateMSGBuf.append(numRemovals).append(" files were removed from index.\n");
                updateMSGBuf.append(numUpdates).append(" files were reindexed.\n");
                updateMSGBuf.append(numNew).append(" new files were added to the index.\n");
                //
                totalChanges = numRemovals + numUpdates + numNew;
                // all our stuff to the notesBuf
                addNote(updateMSGBuf.toString(), "", true);
                // add our new and modified files
                if (numNew > 0) {
                    addNote(I18n.getString("new_files"), "", true);
                    notesBuf.append(newItsBuf);
                }
                //
                if (numUpdates > 0) {
                    addNote(I18n.getString("updated_files"), "", true);
                    notesBuf.append(modItsItsBuf);
                }
                //
                //
                if (numRemovals > 0) {
                    addNote(I18n.getString("deleted_files"), "", true);
                    notesBuf.append(delItsItsBuf);
                }
                //

                addFooter();
                if (errNum == 0) {
                    updateMSGBuf.append("No errors were encountered during this process.");
                    if (numNoIndex > 0) {
                        updateMSGBuf.append("\n\n").append(numNoIndex).append(
                                " files were not indexed due to meta data constraints (robots = NOINDEX), including:\n");
                        updateMSGBuf.append(noRobotsBuf);
                    }
                    ds.showMessage("Update of index " + di.getName() + " Completed", updateMSGBuf.toString());
                } else {
                    updateMSGBuf.append(errNum).append(
                            " errors were encountered during this process.\nThe following files had problems being indexed or re-indexed:\n")
                            .append(failedBuf);
                    if (numNoIndex > 0) {
                        updateMSGBuf.append("\n\n").append(numNoIndex).append(
                                " files were not indexed due to meta data constraints (robots = NOINDEX), including:\n");
                        updateMSGBuf.append(noRobotsBuf);
                    }

                    ds.showMessage("Errors during Update of index " + di.getName(), updateMSGBuf.toString());
                }
            }
            // end of try
            catch (Exception e) {
                logger.error("updateIndex() error during update index " + di.getName(), e);
                ds.showMessage("Error updating index " + di.getName(), e.toString());
            }

            addFooter();
            di.setLastIndexed(DateTimeUtils.getToday());
            ds.setStatus("Update of index " + di.getName() + " completed.");
            ds.setIsWorking(false);
        } else {
            ds.doZipArchiveUpdate(di);
        }
    }
}
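
Note the close/reopen cycle around every IndexWriter use in this method: in pre-4.0 Lucene a reader that performs deletes holds the index write lock, so the reader must be closed before a writer can open on the same directory, and reopened afterwards for maxDoc() and isDeleted() to reflect the writer's changes.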

From source file:org.jab.docsearch.spider.LinkFinder.java

License:Open Source License

/**
 * Updates the index from the spidered URLs.
 *
 * @throws IOException
 */
public void update() throws IOException {
    numDeletes = 0;
    numChanges = 0;
    numNew = 0;
    numFails = 0;
    numUnChanged = 0;
    numMetaNoIdx = 0;

    IndexReader ir = IndexReader.open(dsi.getIndexPath());
    int maxNumDocs = ir.maxDoc();
    int maxTotal = maxNumDocs + maxNumDocs / 10;
    int curDocNum = 0;
    if (ds != null) {
        ds.setStatus(DocSearch.dsTtlDxInIdx + " " + maxNumDocs);
        ds.setIsWorking(true);
        ds.setProgressMax(maxTotal * 2);
        ds.setCurProgressMSG("Spidering Files...");
    }

    // assign index location to urls currently in the index
    int lastFound = 0;
    for (SpiderUrl spy : links) {
        curDocNum++;

        if (ds != null) {
            ds.setCurProgress(curDocNum);
            if (!ds.getIsWorking()) {
                break;
            }
        }

        String curFi = spy.getUrl();
        lastFound = ds.idx.spiderIndexNum(lastFound, curFi, ir);
        spy.setIndexLocation(lastFound);

        if (lastFound == -1) {
            logger.debug("update() " + curFi + " currently is not in the index");
        }
    }

    // now iterate over all the spider urls
    int curSpiderNum = getNextUrlNo();
    int totalSpidered = 0;
    while (curSpiderNum != -1) {
        curDocNum++;

        if (ds != null) {
            ds.setCurProgress(curDocNum);
            if (!ds.getIsWorking()) {
                break;
            }
        }

        SpiderUrl curSpider = getSpiderUrl(curSpiderNum);
        int curNumLinksFound = getNumLinksFound();
        int curIdxNum = curSpider.getIndexLocation();
        // TODO is this getURLSize really needed when the URL is already in the index?
        long curUrlSize = netUtils.getURLSize(curSpider.getUrl());
        String shortUrl = Utils.concatEnd(curSpider.getUrl(), 33);
        String dnldTmpName = getDownloadFileName(curSpider.getContentType(), curSpider.getUrl().toLowerCase());
        String downloadFile = FileUtils.addFolder(downloadFileDir, dnldTmpName);

        // document is too big
        if (curUrlSize > maxFileSizeToGet) {
            logger.debug("update() '" + shortUrl + "' is too big");
            setStatus(I18n.getString("skipping_file_too_big") + " (" + curUrlSize + " > " + maxFileSizeToGet
                    + ") " + shortUrl);
            curSpider.setSize(curUrlSize);
        }
        // document is in index
        else if (curIdxNum != -1) {
            logger.debug("update() '" + shortUrl + "' is in index");
            setStatus(DocSearch.dsCkgFoUpdtsToDoc + " " + shortUrl + " (" + totalSpidered + " / "
                    + curNumLinksFound + ")");

            int curSpiderStatus = netUtils.getURLStatus(curSpider, downloadFile);
            switch (curSpiderStatus) {
            case -1: // broken url
                logger.debug("update() '" + shortUrl + "' is broken");
                setStatus(DocSearch.dsBknLink + " " + shortUrl);
                curSpider.setIsDeadLink(true);
                // remove from index
                ir.deleteDocument(curIdxNum);
                numDeletes++;
                break;
            case 0: // same
                logger.debug("update() '" + shortUrl + "' no changes");
                setStatus(DocSearch.lnkNoChanges + " " + shortUrl);
                numUnChanged++;
                totalSpidered++;
                break;
            case 1: // changed
                logger.debug("update() '" + shortUrl + "' is changed");
                setStatus(DocSearch.dsReIdxgLnk + " " + shortUrl);
                ir.deleteDocument(curIdxNum);
                ir.close();
                iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                // iw.setUseCompoundFile(true);
                int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                iw.close();
                ir = IndexReader.open(dsi.getIndexPath());
                if (curAddedSuccess == 0) {
                    numChanges++;
                    totalSpidered++;
                } else if (curAddedSuccess == 2) {
                    numMetaNoIdx++;
                } else if (curAddedSuccess == 1) {
                    logger.warn("update() indexing failed " + shortUrl);
                    numFails++;
                }

                // get links from downloaded file
                if (isHtml(curSpider.getUrl())) {
                    checkFileForLinks(downloadFile, curSpider.getUrl());
                }
                break;
            }
        }
        // document is not in index
        else {
            logger.debug("update() '" + shortUrl + "' is not in index");
            setStatus(DocSearch.dsSpiderNewUrl + " " + shortUrl + " (" + totalSpidered + " / "
                    + curNumLinksFound + ")");

            boolean downloadOk = netUtils.downloadURLToFile(curSpider, downloadFile);
            if (downloadOk) {
                iw = new IndexWriter(dsi.getIndexPath(), new StandardAnalyzer(), false);
                // iw.setUseCompoundFile(true);
                int curAddedSuccess = ds.idx.addDocToIndex(downloadFile, iw, dsi, false, curSpider);
                iw.close();
                ir.close();
                ir = IndexReader.open(dsi.getIndexPath());
                if (curAddedSuccess == 0) {
                    numNew++;
                    totalSpidered++;
                } else if (curAddedSuccess == 2) {
                    numMetaNoIdx++;
                } else if (curAddedSuccess == 1) {
                    logger.warn("update() indexing failed " + shortUrl);
                    numFails++;
                }
                if (isHtml(curSpider.getUrl())) {
                    checkFileForLinks(downloadFile, curSpider.getUrl());
                }
            } else {
                setStatus(DocSearch.dsBknLink + " " + shortUrl);
                curSpider.setIsDeadLink(true);
            }
        }

        // last things to do
        curSpider.setSpidered(true);
        curSpiderNum = getNextUrlNo();
        if (curSpiderNum == -1) {
            break;
        }
        if (totalSpidered > maxTotal) {
            break;
        }

        // delete temp file
        if (!FileUtils.deleteFile(downloadFile)) {
            logger.warn("update() can't delete file '" + downloadFile + "'");
        }
    }

    setStatus(DocSearch.dsSpdrUpdteComp + " " + dsi.getName());
    saveAllLinks();

    // update the date of the index
    dsi.setLastIndexed(DateTimeUtils.getToday());
    ir.close();
    ds.resetProgress();
}

From source file:org.jab.docsearch.utils.MetaReport.java

License:Open Source License

/**
 * doMetaDataReport
 *
 * @param di
 * @param listAll
 * @param pathRequired
 * @param pathText
 * @param authRequired
 * @param authText
 * @param reportFile
 * @param maxDocs
 * @param useDaysOld
 * @param maxDays
 */
private void doMetaDataReport(DocSearcherIndex di, boolean listAll, boolean pathRequired, String pathText,
        boolean authRequired, String authText, String reportFile, int maxDocs, boolean useDaysOld,
        int maxDays) {
    try {
        // initialize our metrics
        int numBadDocs = 0;
        int totalDocs = 0;
        int numGoodDocs = 0;
        String lineSep = Utils.LINE_SEPARATOR;
        StringBuffer documentBuffer = new StringBuffer();
        StringBuffer metaDataReport = new StringBuffer();

        // initialize the reader
        IndexReader ir = IndexReader.open(di.getIndexPath());
        int numDocs = ir.maxDoc();
        ds.setStatus(numDocs + " " + Messages.getString("DocSearch.numDox") + " " + di.getName());

        // write the start of the table
        documentBuffer.append("<table style=\"empty-cells:show\" border=\"1\">").append(lineSep);
        documentBuffer.append("<tr>").append(lineSep);
        int numHdrs = allFields.length;
        for (int z = 0; z < numHdrs; z++) {
            documentBuffer.append("<th valign=\"top\">");
            documentBuffer.append(allFields[z]);
            documentBuffer.append("</th>").append(lineSep);
        }
        documentBuffer.append("</tr>").append(lineSep);
        for (int i = 0; i < numDocs; i++) {
            if (!ir.isDeleted(i)) {
                Document doc = ir.document(i);
                if (doc != null) {
                    boolean curSkip = false;

                    // put in the docs values
                    String path;
                    if (di.getIsWeb()) {
                        path = doc.get(Index.FIELD_URL);
                    } else {
                        path = doc.get(Index.FIELD_PATH);
                    }

                    ds.setStatus("Examining document: " + path);
                    String type = doc.get(Index.FIELD_TYPE);
                    String author = doc.get(Index.FIELD_AUTHOR);
                    String summary = doc.get(Index.FIELD_SUMMARY);
                    String title = doc.get(Index.FIELD_TITLE);
                    String size = doc.get(Index.FIELD_SIZE);
                    String keywords = doc.get(Index.FIELD_KEYWORDS);
                    String date = DateTimeUtils.getDateParsedFromIndex(doc.get(Index.FIELD_MODDATE));

                    // determine if we even need to examine it
                    if (pathRequired) {
                        if (path.indexOf(pathText) == -1) {
                            curSkip = true;
                        }
                    }

                    if (authRequired) {
                        if (author.indexOf(authText) == -1) {
                            curSkip = true;
                        }
                    }

                    // determine if it's bad or good
                    if (!curSkip) {
                        totalDocs++;
                        boolean isGood = goodMetaData(title, summary, author, date, keywords, type, useDaysOld,
                                maxDays);

                        // write to our file
                        if (!isGood || listAll) {
                            documentBuffer.append("<tr>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">"); // path
                            documentBuffer.append(path);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(Utils.convertTextToHTML(title));
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(author);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(date);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(Utils.convertTextToHTML(summary));
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\"><small>");
                            documentBuffer.append(keywords);
                            documentBuffer.append("</small></td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(size);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("<td valign=\"top\">");
                            documentBuffer.append(type);
                            documentBuffer.append("</td>").append(lineSep);
                            documentBuffer.append("</tr>").append(lineSep);
                        }

                        if (isGood) {
                            ds.setStatus(path + " " + dsNotMsgMeta);
                            numGoodDocs++;
                        } else {
                            ds.setStatus(path + " " + dsMsgMeta);
                            numBadDocs++;
                        }
                    } else {
                        ds.setStatus(dsSkip + " " + path);
                    }
                }
            }

            if (i > maxDocs) {
                break;
            }
        }
        documentBuffer.append("</table>").append(lineSep);

        int percentGood = 0;
        if (totalDocs > 0) {
            percentGood = (numGoodDocs * 100) / totalDocs;
        }

        ds.setStatus("%  " + dsGood + ": " + percentGood + " (" + numGoodDocs + " / " + totalDocs + ", "
                + numBadDocs + " " + dsBad + ").");

        // write complete report with summary
        metaDataReport.append("<html>").append(lineSep);
        metaDataReport.append("<head>").append(lineSep);
        metaDataReport.append("<title>").append(dsMetaRpt).append(' ').append(di.getName()).append("</title>")
                .append(lineSep);
        metaDataReport.append(
                "<meta name=\"description\" content=\"lists documents with poorly searchable meta data\">")
                .append(lineSep);
        metaDataReport.append("<meta name=\"author\" content=\"DocSearcher\">").append(lineSep);
        metaDataReport.append("</head>").append(lineSep);
        metaDataReport.append("<body>").append(lineSep);
        metaDataReport.append("<h1>").append(dsMetaRpt).append(' ').append(di.getName()).append("</h1>")
                .append(lineSep);
        metaDataReport.append("<p align=\"left\"><b>");
        metaDataReport.append(numBadDocs);
        metaDataReport.append("</b> ");
        metaDataReport.append(dsPoorMeta);
        metaDataReport.append(" <br> &amp; <b>");
        metaDataReport.append(numGoodDocs);
        metaDataReport.append("</b> ");
        metaDataReport.append(dsGoodMetaNum);
        metaDataReport.append(".</p>").append(lineSep);
        metaDataReport.append("<p align=\"left\">");
        metaDataReport.append(dsMetaOO);
        metaDataReport.append(" <b>");
        metaDataReport.append(percentGood + "</b> % . </p>");
        metaDataReport.append("<p align=\"left\">");
        metaDataReport.append(dsTblDsc);
        metaDataReport.append(".</p>").append(lineSep);

        // add document buffer
        metaDataReport.append(documentBuffer);

        metaDataReport.append("</body>").append(lineSep);
        metaDataReport.append("</html>").append(lineSep);

        ds.curPage = Messages.getString("DocSearch.report");

        boolean fileSaved = FileUtils.saveFile(reportFile, metaDataReport);
        if (fileSaved) {
            ds.doExternal(reportFile);
        }
    } catch (IOException ioe) {
        logger.fatal("doMetaDataReport() create meta data report failed", ioe);
        ds.setStatus(Messages.getString("DocSearch.statusMetaDataError") + di.getName() + ":" + ioe.toString());
    }
}

From source file:org.lexevs.dao.index.lucenesupport.BaseLuceneIndexTemplate.java

License:Open Source License

public int getMaxDoc() {
    return this.doInIndexReader(new IndexReaderCallback<Integer>() {

        @Override
        public Integer doInIndexReader(IndexReader indexReader) throws Exception {
            return indexReader.maxDoc();
        }
    });
}
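
The template hides reader lifecycle management, so a caller gets the count without ever touching the reader. A hypothetical call site (the template variable is assumed to be injected):

    // hypothetical caller; no reader handling is needed here
    int docIdUpperBound = luceneIndexTemplate.getMaxDoc(); // every doc ID in the index is < this value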

From source file:org.metaservice.core.maven.MavenIndexCrawler.java

License:Apache License

public void perform() throws IOException, ComponentLookupException, InvalidVersionSpecificationException {
    // Files where local cache is (if any) and Lucene Index should be located
    File centralLocalCache = new File("target/central-cache");
    File centralIndexDir = new File("target/central-index");

    // Creators we want to use (search for fields it defines)
    List<IndexCreator> indexers = new ArrayList<>();
    indexers.add(plexusContainer.lookup(IndexCreator.class, "min"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "jarContent"));
    indexers.add(plexusContainer.lookup(IndexCreator.class, "maven-plugin"));

    // Create context for central repository index
    centralContext = indexer.createIndexingContext("central-context", "central", centralLocalCache,
            centralIndexDir, "http://repo1.maven.org/maven2", null, true, true, indexers);

    // Update the index (incremental update will happen if this is not 1st run and files are not deleted)
    // This whole block below should not be executed on every app start, but rather controlled by some configuration
    // since this block will always emit at least one HTTP GET. Central indexes are updated once a week, but
    // other index sources might have different index publishing frequency.
    // Preferred frequency is once a week.
    if (true) {
        System.out.println("Updating Index...");
        System.out.println("This might take a while on first run, so please be patient!");
        // Create ResourceFetcher implementation to be used with IndexUpdateRequest
        // Here, we use Wagon based one as shorthand, but all we need is a ResourceFetcher implementation
        TransferListener listener = new AbstractTransferListener() {
            public void transferStarted(TransferEvent transferEvent) {
                System.out.print(" Downloading " + transferEvent.getResource().getName());
            }

            public void transferProgress(TransferEvent transferEvent, byte[] buffer, int length) {
            }

            public void transferCompleted(TransferEvent transferEvent) {
                System.out.println(" - Done");
            }
        };
        ResourceFetcher resourceFetcher = new WagonHelper.WagonFetcher(httpWagon, listener, null, null);

        Date centralContextCurrentTimestamp = centralContext.getTimestamp();
        IndexUpdateRequest updateRequest = new IndexUpdateRequest(centralContext, resourceFetcher);
        IndexUpdateResult updateResult = indexUpdater.fetchAndUpdateIndex(updateRequest);
        if (updateResult.isFullUpdate()) {
            System.out.println("Full update happened!");
        } else if (updateResult.getTimestamp().equals(centralContextCurrentTimestamp)) {
            System.out.println("No update needed, index is up to date!");
        } else {
            System.out.println("Incremental update happened, change covered " + centralContextCurrentTimestamp
                    + " - " + updateResult.getTimestamp() + " period.");
        }

        System.out.println();
    }

    System.out.println();
    System.out.println("Using index");
    System.out.println("===========");
    System.out.println();

    // ====
    // Case:
    // dump all the GAVs
    // NOTE: we will not actually dump all of Central below (it is HUGE and would take too long), but the code
    // is here as an example

    int j = 0;
    if (true) {

        final IndexSearcher searcher = centralContext.acquireIndexSearcher();
        try {
            final IndexReader ir = searcher.getIndexReader();
            for (int i = 0; i < ir.maxDoc(); i++) {
                if (!ir.isDeleted(i)) {
                    j++;
                    final Document doc = ir.document(i);
                    final ArtifactInfo ai = IndexUtils.constructArtifactInfo(doc, centralContext);
                    if (ai != null && "pom".equals(ai.fextension))
                        System.out.println(ai.groupId + ":" + ai.artifactId + ":" + ai.version + ":"
                                + ai.classifier + " (sha1=" + ai.sha1 + ")");

                }
            }
        } finally {
            centralContext.releaseIndexSearcher(searcher);
        }
    }
    System.err.println(j);
    if (j > 0)
        return;

    // ====
    // Case:
    // Search for all GAVs with known G and A and having version greater than V

    final GenericVersionScheme versionScheme = new GenericVersionScheme();
    final String versionString = "1.5.0";
    final Version version = versionScheme.parseVersion(versionString);

    // construct the query for known GA
    final Query groupIdQ = indexer.constructQuery(MAVEN.GROUP_ID,
            new SourcedSearchExpression("org.sonatype.nexus"));
    final Query artifactIdQ = indexer.constructQuery(MAVEN.ARTIFACT_ID,
            new SourcedSearchExpression("nexus-api"));
    final BooleanQuery query = new BooleanQuery();
    query.add(groupIdQ, BooleanClause.Occur.MUST);
    query.add(artifactIdQ, BooleanClause.Occur.MUST);

    // we want "jar" artifacts only
    query.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("jar")),
            BooleanClause.Occur.MUST);
    // we want main artifacts only (no classifier)
    // Note: this below is unfinished API, needs fixing
    query.add(indexer.constructQuery(MAVEN.CLASSIFIER, new SourcedSearchExpression(Field.NOT_PRESENT)),
            BooleanClause.Occur.MUST_NOT);

    // construct the filter to express "V greater than"
    final ArtifactInfoFilter versionFilter = new ArtifactInfoFilter() {
        public boolean accepts(final IndexingContext ctx, final ArtifactInfo ai) {
            try {
                final Version aiV = versionScheme.parseVersion(ai.version);
                // Use ">=" if you are INCLUSIVE
                return aiV.compareTo(version) > 0;
            } catch (InvalidVersionSpecificationException e) {
                // do something here? be safe and include?
                return true;
            }
        }
    };

    System.out.println(
            "Searching for all GAVs with G=org.sonatype.nexus and nexus-api and having V greater than 1.5.0");
    final IteratorSearchRequest request = new IteratorSearchRequest(query,
            Collections.singletonList(centralContext), versionFilter);
    final IteratorSearchResponse response = indexer.searchIterator(request);
    for (ArtifactInfo ai : response) {
        System.out.println(ai.toString());
    }

    // Case:
    // Use index
    // Searching for some artifact
    Query gidQ = indexer.constructQuery(MAVEN.GROUP_ID,
            new SourcedSearchExpression("org.apache.maven.indexer"));
    Query aidQ = indexer.constructQuery(MAVEN.ARTIFACT_ID, new SourcedSearchExpression("indexer-artifact"));

    BooleanQuery bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);

    searchAndDump(indexer, "all artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);

    // Searching for some main artifact
    bq = new BooleanQuery();
    bq.add(gidQ, BooleanClause.Occur.MUST);
    bq.add(aidQ, BooleanClause.Occur.MUST);
    // bq.add( nexusIndexer.constructQuery( MAVEN.CLASSIFIER, new SourcedSearchExpression( "*" ) ), Occur.MUST_NOT
    // );

    searchAndDump(indexer, "main artifacts under GA org.apache.maven.indexer:indexer-artifact", bq);

    // doing sha1 search
    searchAndDump(indexer, "SHA1 7ab67e6b20e5332a7fb4fdf2f019aec4275846c2", indexer.constructQuery(MAVEN.SHA1,
            new SourcedSearchExpression("7ab67e6b20e5332a7fb4fdf2f019aec4275846c2")));

    searchAndDump(indexer, "SHA1 7ab67e6b20 (partial hash)",
            indexer.constructQuery(MAVEN.SHA1, new UserInputSearchExpression("7ab67e6b20")));

    // doing classname search (incomplete classname)
    searchAndDump(indexer,
            "classname DefaultNexusIndexer (note: Central does not publish classes in the index)",
            indexer.constructQuery(MAVEN.CLASSNAMES, new UserInputSearchExpression("DefaultNexusIndexer")));

    // doing search for all "canonical" maven plugins latest versions
    bq = new BooleanQuery();
    bq.add(indexer.constructQuery(MAVEN.PACKAGING, new SourcedSearchExpression("maven-plugin")),
            BooleanClause.Occur.MUST);
    bq.add(indexer.constructQuery(MAVEN.GROUP_ID, new SourcedSearchExpression("org.apache.maven.plugins")),
            BooleanClause.Occur.MUST);
    searchGroupedAndDump(indexer, "all \"canonical\" maven plugins", bq, new GAGrouping());

    // close cleanly
    indexer.closeIndexingContext(centralContext, false);
}