Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usage of org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
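
numDocs() counts only live (non-deleted) documents; maxDoc() also counts deleted slots. A minimal sketch of the difference, assuming the Lucene 3.x API used by most examples below and a placeholder index path:

import java.io.File;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // open a reader on an existing index ("/tmp/index" is a placeholder)
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            System.out.println("live docs: " + reader.numDocs()); // excludes deletions
            System.out.println("maxDoc:    " + reader.maxDoc());  // includes deleted slots
            System.out.println("deleted:   " + (reader.maxDoc() - reader.numDocs()));
        } finally {
            reader.close();
        }
    }
}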

Usage

From source file:de.linguatools.disco.DISCO.java

License:Apache License

/***************************************************************************
 * Run through all documents (i.e. queryable words) in the index, retrieve
 * each word and its frequency, and write both to the file named
 * outputFileName. This method can be used to check index integrity.<br/>
 * @param outputFileName name of the output file
 * @return number of words written to the output file. In case of success the
 * value is equal to the number of words in the index.
 */
public int wordFrequencyList(String outputFileName) {

    // create an IndexReader for the index directory
    IndexReader ir = null;
    try {
        if (indexRAM != null) {
            ir = IndexReader.open(indexRAM);
        } else {
            ir = IndexReader.open(FSDirectory.open(new File(indexName)));
        }
    } catch (CorruptIndexException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }

    // get the number of documents in the index
    int N = ir.numDocs();

    // open the output file
    FileWriter fw;
    try {
        fw = new FileWriter(outputFileName);
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }

    // iterate over all documents
    int corrupt = 0;
    int ioerror = 0;
    int i = 0;
    for (i = 0; i < N; i++) {
        Document doc = null;
        try {
            doc = ir.document(i);
        } catch (CorruptIndexException ex) {
            corrupt++;
            continue;
        } catch (IOException ex) {
            ioerror++;
            continue;
        }
        // fetch word no. i
        String word = doc.get("word");
        // fetch the frequency of word i
        int f = Integer.parseInt(doc.get("freq"));
        try {
            // write the word and its frequency to the output
            fw.write(word + "\t" + f + "\n");
        } catch (IOException ex) {
            System.out.println(DISCO.class.getName() + ": word " + i + ": " + ex);
            return i;
        }
        // print progress
        if (i % 100 == 0) {
            System.out.print("\r" + i);
        }
    }
    System.out.println();
    if (corrupt > 0 || ioerror > 0) {
        int e = corrupt + ioerror;
        System.out.println("*** WARNING! ***");
        System.out.println("The language data packet \"" + indexName + "\" " + "has " + e + " defect entries ("
                + corrupt + " corrupt, " + ioerror + " IO errors)");
        System.out.println("All functioning words have been written to " + outputFileName);
    }

    // clean up
    try {
        fw.close();
        ir.close();
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }

    return (i - corrupt - ioerror);
}
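
A caveat on the loop above: it walks document IDs from 0 to numDocs() - 1, which is only valid while the index contains no deletions; once documents are deleted, valid IDs run up to maxDoc() - 1 with gaps. A safer variant on the same Lucene 3.x API (a sketch, not part of the original source):

    int maxDoc = ir.maxDoc();
    for (int i = 0; i < maxDoc; i++) {
        if (ir.isDeleted(i)) {
            continue; // skip deleted document slots
        }
        Document doc = ir.document(i);
        // ... process word and frequency as above ...
    }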

From source file:de.schlund.pfixcore.lucefix.PfixReadjustment.java

License:Open Source License

/**
 * Checks list of include parts for changes and updates search index.
 */
public void readjust() {
    Collection<Tripel> partsKnownByPustefix = getUsedTripels();
    IndexReader reader = null;
    PfixQueueManager queue;
    boolean jobDone;
    long startLoop, stopLoop, startCollect, stopCollect, startIndexLoop, stopIndexLoop, startAddLoop,
            stopAddLoop;

    long collectTime = 0;

    int knownDocsSize, newDocs, deleteDocs, numDocs;

    startLoop = stopLoop = startCollect = stopCollect = startIndexLoop = stopIndexLoop = startAddLoop = stopAddLoop = 0;
    newDocs = knownDocsSize = deleteDocs = numDocs = 0;

    startLoop = System.currentTimeMillis();
    Set<Tripel> tripelsToIndex = new TreeSet<Tripel>();

    queue = PfixQueueManager.getInstance(null);
    try {
        jobDone = false;
        startCollect = System.currentTimeMillis();
        partsKnownByPustefix = getUsedTripels();
        stopCollect = System.currentTimeMillis();
        collectTime = stopCollect - startCollect;
        knownDocsSize = partsKnownByPustefix.size();

        try {
            reader = IndexReader.open(LUCENE_DATA);
        } catch (IOException ioe) {
            LOG.warn("broken or nonexistant database -> will queue ALL known parts");

            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert");
                }
            }
            jobDone = true;
        }
        if (!jobDone) {
            numDocs = reader.numDocs();
            startIndexLoop = System.currentTimeMillis();
            docloop: for (int i = 0; i < numDocs; i++) {

                Document currentdoc;
                try {
                    currentdoc = reader.document(i);
                } catch (RuntimeException e) {
                    // this happens if we want to access a deleted
                    // document -> continue
                    continue docloop;
                }

                // check if needed
                String path = currentdoc.get(PreDoc.PATH);
                Tripel pfixTripel = new Tripel(path, null);

                if (partsKnownByPustefix.contains(pfixTripel)) {

                    // checkTs
                    File f = new File(GlobalConfig.getDocroot(), currentdoc.get(PreDoc.FILENAME));
                    if (f.lastModified() != DateField.stringToTime(currentdoc.get(PreDoc.LASTTOUCH))) {
                        // ts differs
                        pfixTripel.setType(Tripel.Type.INSERT);
                        LOG.debug("TS differs: " + pfixTripel);
                        newDocs++;
                        if (!tripelsToIndex.add(pfixTripel)) {
                            LOG.debug("duplicated insert " + pfixTripel);
                        }
                    }
                    partsKnownByPustefix.remove(pfixTripel);
                } else {
                    // part not needed anymore
                    Tripel newTripel = new Tripel(currentdoc.get(PreDoc.PATH), Tripel.Type.DELETE);
                    deleteDocs++;
                    queue.queue(newTripel);
                }

            }
            stopIndexLoop = System.currentTimeMillis();

            // now partsKnownByPustefix only contains parts which are NOT indexed...
            startAddLoop = System.currentTimeMillis();
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                // LOG.debug("adding " + element + " to queue
                // (INDEX)");
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert " + element);
                }
                // queue.queue(element);
            }

            stopAddLoop = System.currentTimeMillis();
        }
    } catch (IOException ioe) {
        LOG.error("error reading index", ioe);
    }

    // it's a TreeSet, so it is already sorted
    // Collections.sort(tripelsToIndex);
    for (Tripel tripel : tripelsToIndex) {
        queue.queue(tripel);
    }

    stopLoop = System.currentTimeMillis();
    long needed = stopLoop - startLoop;
    if (newDocs != 0 || deleteDocs != 0) {
        LOG.debug(needed + "ms (getUsedTripels(): " + collectTime + "ms (" + knownDocsSize + "u) indexloop: "
                + (stopIndexLoop - startIndexLoop) + "|" + (stopAddLoop - startAddLoop) + "ms (" + numDocs
                + "u), added " + newDocs + "+" + deleteDocs + " queueitems");
    }

    try {
        if (reader != null) {
            reader.close();
            reader = null;
        }
    } catch (IOException e) {
        LOG.error("error while closing reader", e);
    }
}

From source file:de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexerTest.java

License:Apache License

@Test
public void testSearch() throws Exception {
    // Check if fields and all documents exists
    IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
    IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
    Assert.assertEquals("Number of documents", 3, ir0.numDocs() + ir1.numDocs());

    Document doc = ir0.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));
    ir0.close();
    ir1.close();

    // Search on the index
    Finder f = new Finder(index, jWeb1T);

    Assert.assertEquals(f.find("relax").size(), 3);
    Assert.assertEquals(f.find("couch").size(), 1);
    Assert.assertEquals(f.find("relax couch").size(), 1);
    Assert.assertEquals(f.find("couchdb").size(), 1);
}
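
When the corpus is split across several indexes, as in this test, the counts can also be aggregated with org.apache.lucene.index.MultiReader, whose numDocs() is the sum over its sub-readers. A sketch on the same Lucene 3.x API; note that closing the MultiReader also closes the wrapped readers:

    IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
    IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
    MultiReader multi = new MultiReader(ir0, ir1);
    // numDocs() of a MultiReader is the sum of its sub-readers' counts
    Assert.assertEquals(3, multi.numDocs());
    multi.close(); // also closes ir0 and ir1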

From source file:de.tudarmstadt.ukp.teaching.uima.nounDecompounding.web1t.LuceneIndexerTest.java

License:Open Source License

@Test
public void testSearch() throws Exception {
    // Check if fields and all documents exists
    IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
    Assert.assertEquals("Number of documents", 2, ir.numDocs());
    Document doc = ir.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));
    ir.close();

    // Search on the index
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(targetIndex));
    QueryParser p = new QueryParser(Version.LUCENE_30, "token", new StandardAnalyzer(Version.LUCENE_30));
    Query q = p.parse("gram:relax");
    Assert.assertEquals("Hit count 'Relax'", 2, searcher.search(q, 100).totalHits);

    q = p.parse("gram:couch");
    Assert.assertEquals("Hit count 'couch'", 1, searcher.search(q, 100).totalHits);

    q = p.parse("gram:relax AND gram:couch");
    Assert.assertEquals("Hit count 'couch'", 1, searcher.search(q, 100).totalHits);

    q = p.parse("gram:couchdb");
    Assert.assertEquals("Hit count 'couchdb'", 1, searcher.search(q, 100).totalHits);
    searcher.close();
}
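
As a sanity check relating the searcher back to numDocs(): a MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery) never returns deleted documents, so its totalHits equals IndexReader.numDocs(). A sketch on the same Lucene 3.x API:

    IndexSearcher s = new IndexSearcher(FSDirectory.open(targetIndex));
    // the hit count of a match-all query equals numDocs(), here 2
    Assert.assertEquals(2, s.search(new MatchAllDocsQuery(), 10).totalHits);
    s.close();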

From source file:de.unihildesheim.iw.cli.DumpIPCs.java

License:Open Source License

private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;

    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }

        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();

    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();

            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();
                if (rx_ipc == null || (rx_ipc.matcher(code).matches())) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record + " (" + record.toFormattedString() + ") " + '['
                                + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}

From source file:dk.defxws.fedoragsearch.server.Config.java

License:Open Source License

private void checkConfig() throws ConfigException {

    if (logger.isDebugEnabled())
        logger.debug("fedoragsearch.properties=" + fgsProps.toString());

    //     Check for unknown properties, indicating typos or wrong property names
    String[] propNames = { "fedoragsearch.deployFile", "fedoragsearch.soapBase", "fedoragsearch.soapUser",
            "fedoragsearch.soapPass", "fedoragsearch.defaultNoXslt",
            "fedoragsearch.defaultGfindObjectsRestXslt", "fedoragsearch.defaultUpdateIndexRestXslt",
            "fedoragsearch.defaultBrowseIndexRestXslt", "fedoragsearch.defaultGetRepositoryInfoRestXslt",
            "fedoragsearch.defaultGetIndexInfoRestXslt", "fedoragsearch.mimeTypes", "fedoragsearch.maxPageSize",
            "fedoragsearch.defaultBrowseIndexTermPageSize", "fedoragsearch.defaultGfindObjectsHitPageSize",
            "fedoragsearch.defaultGfindObjectsSnippetsMax", "fedoragsearch.defaultGfindObjectsFieldMaxLength",
            "fedoragsearch.repositoryNames", "fedoragsearch.indexNames", "fedoragsearch.updaterNames",
            "fedoragsearch.searchResultFilteringModule", "fedoragsearch.searchResultFilteringType" };
    //checkPropNames("fedoragsearch.properties", fgsProps, propNames);

    //     Check rest stylesheets
    checkRestStylesheet("fedoragsearch.defaultNoXslt");
    checkRestStylesheet("fedoragsearch.defaultGfindObjectsRestXslt");
    checkRestStylesheet("fedoragsearch.defaultUpdateIndexRestXslt");
    checkRestStylesheet("fedoragsearch.defaultBrowseIndexRestXslt");
    checkRestStylesheet("fedoragsearch.defaultGetRepositoryInfoRestXslt");
    checkRestStylesheet("fedoragsearch.defaultGetIndexInfoRestXslt");

    //     Check mimeTypes  
    checkMimeTypes("fedoragsearch", fgsProps, "fedoragsearch.mimeTypes");

    //     Check resultPage properties
    try {
        maxPageSize = Integer.parseInt(fgsProps.getProperty("fedoragsearch.maxPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** maxPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultBrowseIndexTermPageSize = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultBrowseIndexTermPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultBrowseIndexTermPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsHitPageSize = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsHitPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsHitPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsSnippetsMax = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsSnippetsMax"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsSnippetsMax is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsFieldMaxLength = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsFieldMaxLength"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsFieldMaxLength is not valid:\n" + e.toString());
    }

    // Check updater properties
    String updaterProperty = fgsProps.getProperty("fedoragsearch.updaterNames");
    if (updaterProperty == null) {
        updaterNameToProps = null; // No updaters will be created
    } else {
        updaterNameToProps = new Hashtable();
        StringTokenizer updaterNames = new StringTokenizer(updaterProperty);
        while (updaterNames.hasMoreTokens()) {
            String updaterName = updaterNames.nextToken();
            try {
                InputStream propStream = null;
                try {
                    propStream = getResourceInputStream("/updater/" + updaterName + "/updater.properties");
                } catch (ConfigException e) {
                    errors.append("\n" + e.getMessage());
                    continue; // properties could not be located; skip this updater
                }
                Properties props = new Properties();
                props.load(propStream);
                propStream.close();

                //MIH
                convertProperties(props);
                if (logger.isInfoEnabled()) {
                    logger.info(
                            configName + "/updater/" + updaterName + "/updater.properties=" + props.toString());
                }

                // Check properties
                String propsNamingFactory = props.getProperty("java.naming.factory.initial");
                String propsProviderUrl = props.getProperty("java.naming.provider.url");
                String propsConnFactory = props.getProperty("connection.factory.name");
                String propsClientId = props.getProperty("client.id");

                if (propsNamingFactory == null) {
                    errors.append("\n*** java.naming.factory.initial not provided in " + configName
                            + "/updater/" + updaterName + "/updater.properties");
                }
                if (propsProviderUrl == null) {
                    errors.append("\n*** java.naming.provider.url not provided in " + configName + "/updater/"
                            + updaterName + "/updater.properties");
                }
                if (propsConnFactory == null) {
                    errors.append("\n*** connection.factory.name not provided in " + configName + "/updater/"
                            + updaterName + "/updater.properties");
                }
                if (propsClientId == null) {
                    errors.append("\n*** client.id not provided in " + configName + "/updater/" + updaterName
                            + "/updater.properties");
                }

                updaterNameToProps.put(updaterName, props);
            } catch (IOException e) {
                errors.append("\n*** Error loading " + configName + "/updater/" + updaterName + ".properties:\n"
                        + e.toString());
            }
        }
    }

    // Check searchResultFilteringModule property
    searchResultFilteringModuleProperty = fgsProps.getProperty("fedoragsearch.searchResultFilteringModule");
    if (searchResultFilteringModuleProperty != null && searchResultFilteringModuleProperty.length() > 0) {
        try {
            getSearchResultFiltering();
        } catch (ConfigException e) {
            errors.append(e.getMessage());
        }
        String searchResultFilteringTypeProperty = fgsProps
                .getProperty("fedoragsearch.searchResultFilteringType");
        StringTokenizer srft = new StringTokenizer("");
        if (searchResultFilteringTypeProperty != null) {
            srft = new StringTokenizer(searchResultFilteringTypeProperty);
        }
        int countTokens = srft.countTokens();
        if (searchResultFilteringTypeProperty == null || countTokens == 0 || countTokens > 1) {
            errors.append("\n*** " + configName + ": fedoragsearch.searchResultFilteringType="
                    + searchResultFilteringTypeProperty
                    + ": one and only one of 'presearch', 'insearch', 'postsearch' must be stated.\n");
        } else {
            for (int i = 0; i < countTokens; i++) {
                String token = srft.nextToken();
                if (!("presearch".equals(token) || "insearch".equals(token) || "postsearch".equals(token))) {
                    errors.append("\n*** " + configName + ": fedoragsearch.searchResultFilteringType="
                            + searchResultFilteringTypeProperty
                            + ": only 'presearch', 'insearch', 'postsearch' may be stated, not '" + token
                            + "'.\n");
                }
            }
        }
    }

    //     Check repository properties
    Enumeration repositoryNames = repositoryNameToProps.keys();
    while (repositoryNames.hasMoreElements()) {
        String repositoryName = (String) repositoryNames.nextElement();
        Properties props = (Properties) repositoryNameToProps.get(repositoryName);
        if (logger.isDebugEnabled())
            logger.debug(configName + "/repository/" + repositoryName + "/repository.properties="
                    + props.toString());

        //        Check for unknown properties, indicating typos or wrong property names
        String[] reposPropNames = { "fgsrepository.repositoryName", "fgsrepository.fedoraSoap",
                "fgsrepository.fedoraUser", "fgsrepository.fedoraPass", "fgsrepository.fedoraObjectDir",
                "fgsrepository.fedoraVersion", "fgsrepository.defaultGetRepositoryInfoResultXslt",
                "fgsrepository.trustStorePath", "fgsrepository.trustStorePass" };
        //checkPropNames(configName+"/repository/"+repositoryName+"/repository.properties", props, reposPropNames);

        //        Check repositoryName
        String propsRepositoryName = props.getProperty("fgsrepository.repositoryName");
        if (!repositoryName.equals(propsRepositoryName)) {
            errors.append("\n*** " + configName + "/repository/" + repositoryName
                    + ": fgsrepository.repositoryName must be=" + repositoryName);
        }

        //        Check fedoraObjectDir
        //          String fedoraObjectDirName = insertSystemProperties(props.getProperty("fgsrepository.fedoraObjectDir"));
        //          File fedoraObjectDir = new File(fedoraObjectDirName);
        //          if (fedoraObjectDir == null) {
        //             errors.append("\n*** "+configName+"/repository/" + repositoryName
        //                   + ": fgsrepository.fedoraObjectDir="
        //                   + fedoraObjectDirName + " not found");
        //          }

        //        Check result stylesheets
        checkResultStylesheet("/repository/" + repositoryName, props,
                "fgsrepository.defaultGetRepositoryInfoResultXslt");
    }

    //     Check index properties
    Enumeration indexNames = indexNameToProps.keys();
    while (indexNames.hasMoreElements()) {
        String indexName = (String) indexNames.nextElement();
        Properties props = (Properties) indexNameToProps.get(indexName);
        if (logger.isDebugEnabled())
            logger.debug(configName + "/index/" + indexName + "/index.properties=" + props.toString());

        //        Check for unknown properties, indicating typos or wrong property names
        String[] indexPropNames = { "fgsindex.indexName", "fgsindex.indexBase", "fgsindex.indexUser",
                "fgsindex.indexPass", "fgsindex.operationsImpl", "fgsindex.defaultUpdateIndexDocXslt",
                "fgsindex.defaultUpdateIndexResultXslt", "fgsindex.defaultGfindObjectsResultXslt",
                "fgsindex.defaultBrowseIndexResultXslt", "fgsindex.defaultGetIndexInfoResultXslt",
                "fgsindex.indexDir", "fgsindex.analyzer", "fgsindex.untokenizedFields",
                "fgsindex.defaultQueryFields", "fgsindex.snippetBegin", "fgsindex.snippetEnd",
                "fgsindex.maxBufferedDocs", "fgsindex.mergeFactor", "fgsindex.ramBufferSizeMb",
                "fgsindex.defaultWriteLockTimeout", "fgsindex.defaultSortFields", "fgsindex.uriResolver" };
        //checkPropNames(configName+"/index/"+indexName+"/index.properties", props, indexPropNames);

        //        Check indexName
        String propsIndexName = props.getProperty("fgsindex.indexName");
        if (!indexName.equals(propsIndexName)) {
            errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.indexName must be="
                    + indexName);
        }

        //        Check operationsImpl class
        String operationsImpl = props.getProperty("fgsindex.operationsImpl");
        if (operationsImpl == null || operationsImpl.equals("")) {
            errors.append(
                    "\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl must be set in "
                            + configName + "/index/" + indexName + ".properties");
        }
        try {
            Class operationsImplClass = Class.forName(operationsImpl);
            try {
                GenericOperationsImpl ops = (GenericOperationsImpl) operationsImplClass
                        .getConstructor(new Class[] {}).newInstance(new Object[] {});
            } catch (InstantiationException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (IllegalAccessException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (InvocationTargetException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (NoSuchMethodException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            }
        } catch (ClassNotFoundException e) {
            errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                    + operationsImpl + ": class not found.\n" + e);
        }

        //        Check result stylesheets
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultUpdateIndexDocXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultUpdateIndexResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultGfindObjectsResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultBrowseIndexResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultGetIndexInfoResultXslt");

        //        Check indexDir
        String indexDir = insertSystemProperties(props.getProperty("fgsindex.indexDir"));
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.isDirectory()) {
            errors.append("\n*** " + configName + "/index/" + indexName + " fgsindex.indexDir=" + indexDir
                    + " must exist as a directory");
        }

        //        Check analyzer class for lucene and solr
        if (operationsImpl.indexOf("fgslucene") > -1 || operationsImpl.indexOf("fgssolr") > -1) {
            String analyzer = props.getProperty("fgsindex.analyzer");
            if (analyzer == null || analyzer.equals("")) {
                analyzer = defaultAnalyzer;
            }
            try {
                Class analyzerClass = Class.forName(analyzer);
                try {
                    Analyzer a = (Analyzer) analyzerClass.getConstructor(new Class[] {})
                            .newInstance(new Object[] {});
                } catch (InstantiationException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (IllegalAccessException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (InvocationTargetException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (NoSuchMethodException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error:\n" + e.toString());
                }
            } catch (ClassNotFoundException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.analyzer=" + analyzer
                        + ": class not found:\n" + e.toString());
            }
        }

        //        Add untokenizedFields property for lucene
        if (operationsImpl.indexOf("fgslucene") > -1) {
            String defaultUntokenizedFields = props.getProperty("fgsindex.untokenizedFields");
            if (defaultUntokenizedFields == null)
                props.setProperty("fgsindex.untokenizedFields", "");
            if (indexDirFile.isDirectory()) {
                StringBuffer untokenizedFields = new StringBuffer(
                        props.getProperty("fgsindex.untokenizedFields"));
                IndexReader ir = null;
                try {
                    ir = IndexReader.open(FSDirectory.open(new File(indexDir)), true);
                    int max = ir.numDocs();
                    if (max > 10)
                        max = 10;
                    for (int i = 0; i < max; i++) {
                        Document doc = ir.document(i);
                        for (ListIterator li = doc.getFields().listIterator(); li.hasNext();) {
                            Field f = (Field) li.next();
                            if (!f.isTokenized() && f.isIndexed() && untokenizedFields.indexOf(f.name()) < 0) {
                                untokenizedFields.append(" " + f.name());
                            }
                        }
                    }
                } catch (Exception e) {
                    // ignore: keep whatever untokenized fields were collected so far
                } finally {
                    if (ir != null) {
                        try {
                            ir.close();
                        } catch (IOException ioe) {
                            // ignore close failure
                        }
                    }
                }
                props.setProperty("fgsindex.untokenizedFields", untokenizedFields.toString());
                if (logger.isDebugEnabled())
                    logger.debug("indexName=" + indexName + " fgsindex.untokenizedFields=" + untokenizedFields);
            }
        }

        //        Check defaultQueryFields - how can we check this?
        String defaultQueryFields = props.getProperty("fgsindex.defaultQueryFields");

        //        Use custom URIResolver if given
        //MIH: also check for solr
        if (operationsImpl.indexOf("fgslucene") > -1 || operationsImpl.indexOf("fgssolr") > -1) {
            Class uriResolverClass = null;
            String uriResolver = props.getProperty("fgsindex.uriResolver");
            if (!(uriResolver == null || uriResolver.equals(""))) {
                try {
                    uriResolverClass = Class.forName(uriResolver);
                    try {
                        URIResolverImpl ur = (URIResolverImpl) uriResolverClass.getConstructor(new Class[] {})
                                .newInstance(new Object[] {});
                        if (ur != null) {
                            ur.setConfig(this);
                            indexNameToUriResolvers.put(indexName, ur);
                        }
                    } catch (InstantiationException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (IllegalAccessException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (InvocationTargetException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (NoSuchMethodException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error:\n"
                                + e.toString());
                    }
                } catch (ClassNotFoundException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.uriResolver="
                            + uriResolver + ": class not found:\n" + e.toString());
                }
            }
        }
    }
    if (logger.isDebugEnabled())
        logger.debug("configCheck configName=" + configName + " errors=" + errors.toString());
    if (errors.length() > 0)
        throw new ConfigException(errors.toString());
}

From source file:drakkar.mast.retrieval.SVNContext.java

/**
 * {@inheritDoc}
 */
public boolean loadIndex(File indexPath) throws IOException, IndexException {
    IndexReader reader = null;
    boolean flag = false;

    if (indexPath == null || !indexPath.exists() || !indexPath.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(indexPath))) {
        message = "Not found index in default index path";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        throw new IndexException(message);

    } else {

        reader = IndexReader.open(FSDirectory.open(indexPath));
        loadedDocs = reader.numDocs();
        reader.close();

        message = "Loading SVN index...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            message = "Error loading index: " + ex.toString();
            OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, message);
        }
        message = "Total of documents of the index: " + loadedDocs;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        flag = true;
        this.notifyLoadedDocument(loadedDocs);

    }

    return flag;
}

From source file:drakkar.mast.retrieval.SVNContext.java

/**
 * {@inheritDoc}
 */
public boolean loadIndex() throws IndexException, IOException {
    IndexReader reader = null;
    File defaultFile = new File(this.defaultIndexPath);

    boolean flag = false;

    if (!defaultFile.exists() || !defaultFile.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(defaultFile))) {
        message = "Not found index in default index path";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        throw new IndexException(message);

    } else {

        reader = IndexReader.open(FSDirectory.open(defaultFile));
        loadedDocs = reader.numDocs();
        reader.close();

        message = "Loading SVN index...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            message = "Error loading index: " + ex.toString();
            OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, message);
        }
        message = "Total of documents of the index: " + loadedDocs;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        flag = true;
        this.notifyLoadedDocument(loadedDocs);

    }

    return flag;
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java

License:Open Source License

/**
 *
 * Extracts the top n most frequent terms (by document frequency) from an existing Lucene index
 * (the directory must be specified via args, or via tfidfTester in
 * resources/lite/configs/general.conf; in this case the Wikipedia corpus) and reports their
 * document frequency.
 *
 * @param args
 * @throws java.lang.Exception
 */
public static void main(String[] args) throws Exception {
    try {
        Properties prop = new Properties();
        InputStream is = new FileInputStream("resources/lite/configs/general.conf");
        // load the config before its properties are read below
        prop.load(is);
        FSDirectory dir;
        if (args.length == 1) {
            if (Paths.get(args[0]).toFile().isDirectory()) {
                dir = FSDirectory.open(new File(args[0]));
            } else {
                System.out.println("The specified directory does not exist\n"
                        + " backing to load the lucene index specified in the config files");
                dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
            }
        } else if (args.length > 1) {
            System.out.println("The args only need one parameter, the directory of the Lucene Index\n "
                    + "backing to load the lucene index specified in the config files");
            dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
        } else {
            dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
        }
        String field = null;
        boolean IncludeTermFreqs = true;
        IndexReader reader = DirectoryReader.open(dir);
        System.out.println("num Docs " + reader.numDocs());
        TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
        if (!IncludeTermFreqs) {
            //default HighFreqTerms behavior
            for (int i = 0; i < terms.length; i++) {
                System.out.printf("%s:%s %,d \n", terms[i].field, terms[i].termtext.utf8ToString(),
                        terms[i].docFreq);
            }
        } else {
            TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
            for (int i = 0; i < termsWithTF.length; i++) {
                System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", termsWithTF[i].field,
                        termsWithTF[i].termtext.utf8ToString(), termsWithTF[i].totalTermFreq,
                        termsWithTF[i].docFreq);
            }
        }
        reader.close();
    } catch (Exception ex) {
        logger.error("The directory specified contains a Lucene index?", ex);
    }
}

From source file:edu.illinois.cs.cogcomp.bigdata.lucene.Lucene.java

License:Open Source License

/**
 * Uses a custom similarity to compute IDF; use this if you want to implement
 * IDF(numDocs, docFreq)
 * 
 * @param reader
 * @param field
 * @param tfidfSIM
 * @return
 * @throws IOException
 */
public static Map<String, Float> getIdfs(IndexReader reader, String field, TFIDFSimilarity tfidfSIM)
        throws IOException {
    Map<String, Float> docFrequencies = new HashMap<>();

    TermsEnum termEnum = MultiFields.getTerms(reader, field).iterator();
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
        if (termEnum.seekExact(bytesRef)) {
            String term = bytesRef.utf8ToString();

            float idf = tfidfSIM.idf(termEnum.docFreq(), reader.numDocs());
            docFrequencies.put(term, idf);
        }
    }

    return docFrequencies;
}
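
A possible call site for getIdfs (a sketch, not from the original source; it assumes a Lucene 4.x/5.x classpath, where DefaultSimilarity extends TFIDFSimilarity, plus a placeholder index path and field name):

    // "/tmp/index" and "contents" are placeholders, purely for illustration
    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/index")));
    Map<String, Float> idfs = Lucene.getIdfs(reader, "contents", new DefaultSimilarity());
    for (Map.Entry<String, Float> e : idfs.entrySet()) {
        System.out.println(e.getKey() + "\t" + e.getValue());
    }
    reader.close();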