List of usage examples for org.apache.lucene.index.IndexReader.numDocs()
public abstract int numDocs();
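Returns the number of documents in the index. Note that numDocs() counts only live (non-deleted) documents, while maxDoc() also counts deleted documents that have not yet been merged away. A minimal sketch of the distinction (assuming Lucene 5.x, where FSDirectory.open takes a java.nio.file.Path; the index path is hypothetical):

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsDemo {
    public static void main(String[] args) throws Exception {
        // Open a reader on an existing index directory (path is hypothetical)
        try (DirectoryReader reader = DirectoryReader.open(
                FSDirectory.open(Paths.get("/tmp/my-index")))) {
            System.out.println("live docs:    " + reader.numDocs());        // excludes deletions
            System.out.println("max doc:      " + reader.maxDoc());         // includes deleted slots
            System.out.println("deleted docs: " + reader.numDeletedDocs()); // maxDoc - numDocs
        }
    }
}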
From source file: de.linguatools.disco.DISCO.java
License: Apache License
/***************************************************************************
 * Run through all documents (i.e. queryable words) in the index and retrieve
 * each word together with its frequency. Write both to the file named
 * outputFileName. This method can be used to check index integrity.<br/>
 * @param outputFileName
 * @return number of words written to the output file. On success the value
 * equals the number of words in the index.
 */
public int wordFrequencyList(String outputFileName) {
    // create an IndexReader for the index directory
    IndexReader ir = null;
    try {
        if (indexRAM != null) {
            ir = IndexReader.open(indexRAM);
        } else {
            ir = IndexReader.open(FSDirectory.open(new File(indexName)));
        }
    } catch (CorruptIndexException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // get the number of documents in the index
    int N = ir.numDocs();
    // open the output file
    FileWriter fw;
    try {
        fw = new FileWriter(outputFileName);
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    // iterate over all documents
    int corrupt = 0;
    int ioerror = 0;
    int i = 0;
    for (i = 0; i < N; i++) {
        Document doc = null;
        try {
            doc = ir.document(i);
        } catch (CorruptIndexException ex) {
            corrupt++;
            continue;
        } catch (IOException ex) {
            ioerror++;
            continue;
        }
        // fetch word no. i
        String word = doc.get("word");
        // fetch the frequency of word i
        int f = Integer.parseInt(doc.get("freq"));
        try {
            // write word and frequency to the output file
            fw.write(word + "\t" + f + "\n");
        } catch (IOException ex) {
            System.out.println(DISCO.class.getName() + ": word " + i + ": " + ex);
            return i;
        }
        // print progress
        if (i % 100 == 0) {
            System.out.print("\r" + i);
        }
    }
    System.out.println();
    if (corrupt > 0 || ioerror > 0) {
        int e = corrupt + ioerror;
        System.out.println("*** WARNING! ***");
        System.out.println("The language data packet \"" + indexName + "\" "
                + "has " + e + " defect entries (" + corrupt + " corrupt, "
                + ioerror + " IO errors)");
        System.out.println("All functioning words have been written to " + outputFileName);
    }
    // clean up
    try {
        fw.close();
        ir.close();
    } catch (IOException ex) {
        System.out.println(DISCO.class.getName() + ": " + ex);
        return -1;
    }
    return (i - corrupt - ioerror);
}
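The loop above assumes that document IDs 0..numDocs()-1 are all valid, which holds only for an index without deletions; once documents have been deleted, IDs run up to maxDoc() and some slots are no longer live. A deletion-safe variant, sketched against the same Lucene 3.x API as the example (indexName as above):

IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexName)));
try {
    // iterate up to maxDoc() and skip deleted slots instead of
    // assuming IDs 0..numDocs()-1 are all live
    for (int i = 0; i < ir.maxDoc(); i++) {
        if (ir.isDeleted(i)) {
            continue; // deleted document slot
        }
        Document doc = ir.document(i);
        // ... process doc ...
    }
} finally {
    ir.close();
}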
From source file: de.schlund.pfixcore.lucefix.PfixReadjustment.java
License: Open Source License
/**
 * Checks the list of include parts for changes and updates the search index.
 */
public void readjust() {
    Collection<Tripel> partsKnownByPustefix = getUsedTripels();
    IndexReader reader = null;
    PfixQueueManager queue;
    boolean jobDone;
    long startLoop, stopLoop, startCollect, stopCollect, startIndexLoop, stopIndexLoop, startAddLoop, stopAddLoop;
    long collectTime = 0;
    int knownDocsSize, newDocs, deleteDocs, numDocs;
    startLoop = stopLoop = startCollect = stopCollect = startIndexLoop = stopIndexLoop = startAddLoop = stopAddLoop = 0;
    newDocs = knownDocsSize = deleteDocs = numDocs = 0;
    startLoop = System.currentTimeMillis();

    Set<Tripel> tripelsToIndex = new TreeSet<Tripel>();
    queue = PfixQueueManager.getInstance(null);
    try {
        jobDone = false;
        startCollect = System.currentTimeMillis();
        partsKnownByPustefix = getUsedTripels();
        stopCollect = System.currentTimeMillis();
        collectTime = stopCollect - startCollect;
        knownDocsSize = partsKnownByPustefix.size();
        try {
            reader = IndexReader.open(LUCENE_DATA);
        } catch (IOException ioe) {
            LOG.warn("broken or nonexistent database -> will queue ALL known parts");
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert");
                }
            }
            jobDone = true;
        }
        if (!jobDone) {
            numDocs = reader.numDocs();
            startIndexLoop = System.currentTimeMillis();
            docloop: for (int i = 0; i < numDocs; i++) {
                Document currentdoc;
                try {
                    currentdoc = reader.document(i);
                } catch (RuntimeException e) {
                    // this happens if we want to access a deleted document -> continue
                    continue docloop;
                }
                // check if still needed
                String path = currentdoc.get(PreDoc.PATH);
                Tripel pfixTripel = new Tripel(path, null);
                if (partsKnownByPustefix.contains(pfixTripel)) {
                    // check timestamps
                    File f = new File(GlobalConfig.getDocroot(), currentdoc.get(PreDoc.FILENAME));
                    if (f.lastModified() != DateField.stringToTime(currentdoc.get(PreDoc.LASTTOUCH))) {
                        // timestamp differs
                        pfixTripel.setType(Tripel.Type.INSERT);
                        LOG.debug("TS differs: " + pfixTripel);
                        newDocs++;
                        if (!tripelsToIndex.add(pfixTripel)) {
                            LOG.debug("duplicated insert " + pfixTripel);
                        }
                    }
                    partsKnownByPustefix.remove(pfixTripel);
                } else {
                    // part not needed anymore
                    Tripel newTripel = new Tripel(currentdoc.get(PreDoc.PATH), Tripel.Type.DELETE);
                    deleteDocs++;
                    queue.queue(newTripel);
                }
            }
            stopIndexLoop = System.currentTimeMillis();
            // now partsKnownByPustefix only contains parts which are NOT indexed...
            startAddLoop = System.currentTimeMillis();
            for (Iterator<Tripel> iter = partsKnownByPustefix.iterator(); iter.hasNext();) {
                Tripel element = iter.next();
                element.setType(Tripel.Type.INSERT);
                // LOG.debug("adding " + element + " to queue (INDEX)");
                newDocs++;
                if (!tripelsToIndex.add(element)) {
                    LOG.debug("duplicated insert " + element);
                }
                // queue.queue(element);
            }
            stopAddLoop = System.currentTimeMillis();
        }
    } catch (IOException ioe) {
        LOG.error("error reading index", ioe);
    }

    // it's a TreeSet, so it is already sorted
    // Collections.sort(tripelsToIndex);
    for (Tripel tripel : tripelsToIndex) {
        queue.queue(tripel);
    }
    stopLoop = System.currentTimeMillis();

    long needed = stopLoop - startLoop;
    if (newDocs != 0 || deleteDocs != 0) {
        LOG.debug(needed + "ms (getUsedTripels(): " + collectTime + "ms (" + knownDocsSize
                + "u) indexloop: " + (stopIndexLoop - startIndexLoop) + "|"
                + (stopAddLoop - startAddLoop) + "ms (" + numDocs + "u), added "
                + newDocs + "+" + deleteDocs + " queueitems");
    }
    try {
        if (reader != null) {
            reader.close();
            reader = null;
        }
    } catch (IOException e) {
        LOG.error("error while closing reader", e);
    }
}
From source file: de.tudarmstadt.ukp.dkpro.core.decompounding.web1t.LuceneIndexerTest.java
License: Apache License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
    IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
    Assert.assertEquals("Number of documents", 3, ir0.numDocs() + ir1.numDocs());

    Document doc = ir0.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));

    ir0.close();
    ir1.close();

    // Search on the index
    Finder f = new Finder(index, jWeb1T);
    Assert.assertEquals(f.find("relax").size(), 3);
    Assert.assertEquals(f.find("couch").size(), 1);
    Assert.assertEquals(f.find("relax couch").size(), 1);
    Assert.assertEquals(f.find("couchdb").size(), 1);
}
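Instead of summing the counts of two readers by hand, the two indexes could also be wrapped in a MultiReader, whose numDocs() is the sum of the live documents of its sub-readers. A sketch under the same Lucene 3.x API and test fixtures (targetIndex0, targetIndex1) as above:

IndexReader ir0 = IndexReader.open(FSDirectory.open(targetIndex0));
IndexReader ir1 = IndexReader.open(FSDirectory.open(targetIndex1));
// present both indexes as one reader
IndexReader combined = new MultiReader(ir0, ir1);
try {
    Assert.assertEquals("Number of documents", 3, combined.numDocs());
} finally {
    combined.close(); // this constructor also closes the sub-readers
}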
From source file: de.tudarmstadt.ukp.teaching.uima.nounDecompounding.web1t.LuceneIndexerTest.java
License: Open Source License
@Test
public void testSearch() throws Exception {
    // Check that the fields and all documents exist
    IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
    Assert.assertEquals("Number of documents", 2, ir.numDocs());

    Document doc = ir.document(0);
    Assert.assertNotNull("Field: gram", doc.getField("gram"));
    Assert.assertNotNull("Field: freq", doc.getField("freq"));

    ir.close();

    // Search on the index
    IndexSearcher searcher = new IndexSearcher(FSDirectory.open(targetIndex));
    QueryParser p = new QueryParser(Version.LUCENE_30, "token", new StandardAnalyzer(Version.LUCENE_30));

    Query q = p.parse("gram:relax");
    Assert.assertEquals("Hit count 'relax'", 2, searcher.search(q, 100).totalHits);

    q = p.parse("gram:couch");
    Assert.assertEquals("Hit count 'couch'", 1, searcher.search(q, 100).totalHits);

    q = p.parse("gram:relax AND gram:couch");
    Assert.assertEquals("Hit count 'relax AND couch'", 1, searcher.search(q, 100).totalHits);

    q = p.parse("gram:couchdb");
    Assert.assertEquals("Hit count 'couchdb'", 1, searcher.search(q, 100).totalHits);

    searcher.close();
}
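A quick consistency check between reader and searcher: a MatchAllDocsQuery must hit exactly numDocs() documents, since every live document matches it. A sketch against the same Lucene 3.x fixtures as the test above:

IndexReader ir = IndexReader.open(FSDirectory.open(targetIndex));
IndexSearcher searcher = new IndexSearcher(ir);
// every live document matches, so totalHits == numDocs()
Assert.assertEquals(ir.numDocs(), searcher.search(new MatchAllDocsQuery(), 100).totalHits);
searcher.close();
ir.close();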
From source file: de.unihildesheim.iw.cli.DumpIPCs.java
License: Open Source License
private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check if files and directories are sane
    this.cliParams.check();
    assert this.cliParams.idxReader != null;

    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;
    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }
        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();
    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();
            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();
                if (rx_ipc == null || rx_ipc.matcher(code).matches()) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record + " (" + record.toFormattedString() + ") "
                                + '[' + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}
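The numDocs() > 0 guard above matters because the reader has been wrapped with a query filter, and MultiFields.getTerms returns null for a field that holds no terms. Stripped of the IPC-specific filtering, the core enumeration pattern looks roughly like this (Lucene 4.x-style MultiFields API as in the example; the field name is an assumption):

if (idxReader.numDocs() > 0) {
    Terms terms = MultiFields.getTerms(idxReader, "ipc"); // field name assumed
    if (terms != null) {
        TermsEnum te = terms.iterator(null); // no TermsEnum to reuse
        for (BytesRef t = te.next(); t != null; t = te.next()) {
            System.out.println(t.utf8ToString());
        }
    }
}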
From source file: dk.defxws.fedoragsearch.server.Config.java
License: Open Source License
private void checkConfig() throws ConfigException {
    if (logger.isDebugEnabled())
        logger.debug("fedoragsearch.properties=" + fgsProps.toString());

    // Check for unknown properties, indicating typos or wrong property names
    String[] propNames = { "fedoragsearch.deployFile", "fedoragsearch.soapBase", "fedoragsearch.soapUser",
            "fedoragsearch.soapPass", "fedoragsearch.defaultNoXslt",
            "fedoragsearch.defaultGfindObjectsRestXslt", "fedoragsearch.defaultUpdateIndexRestXslt",
            "fedoragsearch.defaultBrowseIndexRestXslt", "fedoragsearch.defaultGetRepositoryInfoRestXslt",
            "fedoragsearch.defaultGetIndexInfoRestXslt", "fedoragsearch.mimeTypes",
            "fedoragsearch.maxPageSize", "fedoragsearch.defaultBrowseIndexTermPageSize",
            "fedoragsearch.defaultGfindObjectsHitPageSize", "fedoragsearch.defaultGfindObjectsSnippetsMax",
            "fedoragsearch.defaultGfindObjectsFieldMaxLength", "fedoragsearch.repositoryNames",
            "fedoragsearch.indexNames", "fedoragsearch.updaterNames",
            "fedoragsearch.searchResultFilteringModule", "fedoragsearch.searchResultFilteringType" };
    //checkPropNames("fedoragsearch.properties", fgsProps, propNames);

    // Check rest stylesheets
    checkRestStylesheet("fedoragsearch.defaultNoXslt");
    checkRestStylesheet("fedoragsearch.defaultGfindObjectsRestXslt");
    checkRestStylesheet("fedoragsearch.defaultUpdateIndexRestXslt");
    checkRestStylesheet("fedoragsearch.defaultBrowseIndexRestXslt");
    checkRestStylesheet("fedoragsearch.defaultGetRepositoryInfoRestXslt");
    checkRestStylesheet("fedoragsearch.defaultGetIndexInfoRestXslt");

    // Check mimeTypes
    checkMimeTypes("fedoragsearch", fgsProps, "fedoragsearch.mimeTypes");

    // Check resultPage properties
    try {
        maxPageSize = Integer.parseInt(fgsProps.getProperty("fedoragsearch.maxPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** maxPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultBrowseIndexTermPageSize = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultBrowseIndexTermPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultBrowseIndexTermPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsHitPageSize = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsHitPageSize"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsHitPageSize is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsSnippetsMax = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsSnippetsMax"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsSnippetsMax is not valid:\n" + e.toString());
    }
    try {
        defaultGfindObjectsFieldMaxLength = Integer
                .parseInt(fgsProps.getProperty("fedoragsearch.defaultGfindObjectsFieldMaxLength"));
    } catch (NumberFormatException e) {
        errors.append("\n*** defaultGfindObjectsFieldMaxLength is not valid:\n" + e.toString());
    }

    // Check updater properties
    String updaterProperty = fgsProps.getProperty("fedoragsearch.updaterNames");
    if (updaterProperty == null) {
        updaterNameToProps = null; // No updaters will be created
    } else {
        updaterNameToProps = new Hashtable();
        StringTokenizer updaterNames = new StringTokenizer(updaterProperty);
        while (updaterNames.hasMoreTokens()) {
            String updaterName = updaterNames.nextToken();
            try {
                InputStream propStream = null;
                try {
                    propStream = getResourceInputStream("/updater/" + updaterName + "/updater.properties");
                } catch (ConfigException e) {
                    errors.append("\n" + e.getMessage());
                }
                Properties props = new Properties();
                props.load(propStream);
                propStream.close();
                //MIH convertProperties(props);
                if (logger.isInfoEnabled()) {
                    logger.info(configName + "/updater/" + updaterName + "/updater.properties="
                            + props.toString());
                }
                // Check properties
                String propsNamingFactory = props.getProperty("java.naming.factory.initial");
                String propsProviderUrl = props.getProperty("java.naming.provider.url");
                String propsConnFactory = props.getProperty("connection.factory.name");
                String propsClientId = props.getProperty("client.id");
                if (propsNamingFactory == null) {
                    errors.append("\n*** java.naming.factory.initial not provided in " + configName
                            + "/updater/" + updaterName + "/updater.properties");
                }
                if (propsProviderUrl == null) {
                    errors.append("\n*** java.naming.provider.url not provided in " + configName
                            + "/updater/" + updaterName + "/updater.properties");
                }
                if (propsConnFactory == null) {
                    errors.append("\n*** connection.factory.name not provided in " + configName
                            + "/updater/" + updaterName + "/updater.properties");
                }
                if (propsClientId == null) {
                    errors.append("\n*** client.id not provided in " + configName + "/updater/"
                            + updaterName + "/updater.properties");
                }
                updaterNameToProps.put(updaterName, props);
            } catch (IOException e) {
                errors.append("\n*** Error loading " + configName + "/updater/" + updaterName
                        + ".properties:\n" + e.toString());
            }
        }
    }

    // Check searchResultFilteringModule property
    searchResultFilteringModuleProperty = fgsProps.getProperty("fedoragsearch.searchResultFilteringModule");
    if (searchResultFilteringModuleProperty != null && searchResultFilteringModuleProperty.length() > 0) {
        try {
            getSearchResultFiltering();
        } catch (ConfigException e) {
            errors.append(e.getMessage());
        }
        String searchResultFilteringTypeProperty = fgsProps
                .getProperty("fedoragsearch.searchResultFilteringType");
        StringTokenizer srft = new StringTokenizer("");
        if (searchResultFilteringTypeProperty != null) {
            srft = new StringTokenizer(searchResultFilteringTypeProperty);
        }
        int countTokens = srft.countTokens();
        if (searchResultFilteringTypeProperty == null || countTokens == 0 || countTokens > 1) {
            errors.append("\n*** " + configName + ": fedoragsearch.searchResultFilteringType="
                    + searchResultFilteringTypeProperty
                    + ": one and only one of 'presearch', 'insearch', 'postsearch' must be stated.\n");
        } else {
            for (int i = 0; i < countTokens; i++) {
                String token = srft.nextToken();
                if (!("presearch".equals(token) || "insearch".equals(token) || "postsearch".equals(token))) {
                    errors.append("\n*** " + configName + ": fedoragsearch.searchResultFilteringType="
                            + searchResultFilteringTypeProperty
                            + ": only 'presearch', 'insearch', 'postsearch' may be stated, not '"
                            + token + "'.\n");
                }
            }
        }
    }

    // Check repository properties
    Enumeration repositoryNames = repositoryNameToProps.keys();
    while (repositoryNames.hasMoreElements()) {
        String repositoryName = (String) repositoryNames.nextElement();
        Properties props = (Properties) repositoryNameToProps.get(repositoryName);
        if (logger.isDebugEnabled())
            logger.debug(configName + "/repository/" + repositoryName + "/repository.properties="
                    + props.toString());

        // Check for unknown properties, indicating typos or wrong property names
        String[] reposPropNames = { "fgsrepository.repositoryName", "fgsrepository.fedoraSoap",
                "fgsrepository.fedoraUser", "fgsrepository.fedoraPass", "fgsrepository.fedoraObjectDir",
                "fgsrepository.fedoraVersion", "fgsrepository.defaultGetRepositoryInfoResultXslt",
                "fgsrepository.trustStorePath", "fgsrepository.trustStorePass" };
        //checkPropNames(configName+"/repository/"+repositoryName+"/repository.properties", props, reposPropNames);

        // Check repositoryName
        String propsRepositoryName = props.getProperty("fgsrepository.repositoryName");
        if (!repositoryName.equals(propsRepositoryName)) {
            errors.append("\n*** " + configName + "/repository/" + repositoryName
                    + ": fgsrepository.repositoryName must be=" + repositoryName);
        }

        // Check fedoraObjectDir
        //String fedoraObjectDirName = insertSystemProperties(props.getProperty("fgsrepository.fedoraObjectDir"));
        //File fedoraObjectDir = new File(fedoraObjectDirName);
        //if (!fedoraObjectDir.exists()) {
        //    errors.append("\n*** " + configName + "/repository/" + repositoryName
        //            + ": fgsrepository.fedoraObjectDir=" + fedoraObjectDirName + " not found");
        //}

        // Check result stylesheets
        checkResultStylesheet("/repository/" + repositoryName, props,
                "fgsrepository.defaultGetRepositoryInfoResultXslt");
    }

    // Check index properties
    Enumeration indexNames = indexNameToProps.keys();
    while (indexNames.hasMoreElements()) {
        String indexName = (String) indexNames.nextElement();
        Properties props = (Properties) indexNameToProps.get(indexName);
        if (logger.isDebugEnabled())
            logger.debug(configName + "/index/" + indexName + "/index.properties=" + props.toString());

        // Check for unknown properties, indicating typos or wrong property names
        String[] indexPropNames = { "fgsindex.indexName", "fgsindex.indexBase", "fgsindex.indexUser",
                "fgsindex.indexPass", "fgsindex.operationsImpl", "fgsindex.defaultUpdateIndexDocXslt",
                "fgsindex.defaultUpdateIndexResultXslt", "fgsindex.defaultGfindObjectsResultXslt",
                "fgsindex.defaultBrowseIndexResultXslt", "fgsindex.defaultGetIndexInfoResultXslt",
                "fgsindex.indexDir", "fgsindex.analyzer", "fgsindex.untokenizedFields",
                "fgsindex.defaultQueryFields", "fgsindex.snippetBegin", "fgsindex.snippetEnd",
                "fgsindex.maxBufferedDocs", "fgsindex.mergeFactor", "fgsindex.ramBufferSizeMb",
                "fgsindex.defaultWriteLockTimeout", "fgsindex.defaultSortFields", "fgsindex.uriResolver" };
        //checkPropNames(configName+"/index/"+indexName+"/index.properties", props, indexPropNames);

        // Check indexName
        String propsIndexName = props.getProperty("fgsindex.indexName");
        if (!indexName.equals(propsIndexName)) {
            errors.append("\n*** " + configName + "/index/" + indexName
                    + ": fgsindex.indexName must be=" + indexName);
        }

        // Check operationsImpl class
        String operationsImpl = props.getProperty("fgsindex.operationsImpl");
        if (operationsImpl == null || operationsImpl.equals("")) {
            errors.append("\n*** " + configName + "/index/" + indexName
                    + ": fgsindex.operationsImpl must be set in " + configName + "/index/ "
                    + indexName + ".properties");
        }
        try {
            Class operationsImplClass = Class.forName(operationsImpl);
            try {
                GenericOperationsImpl ops = (GenericOperationsImpl) operationsImplClass
                        .getConstructor(new Class[] {}).newInstance(new Object[] {});
            } catch (InstantiationException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (IllegalAccessException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (InvocationTargetException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            } catch (NoSuchMethodException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                        + operationsImpl + ": instantiation error.\n" + e.toString());
            }
        } catch (ClassNotFoundException e) {
            errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.operationsImpl="
                    + operationsImpl + ": class not found.\n" + e);
        }

        // Check result stylesheets
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultUpdateIndexDocXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultUpdateIndexResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultGfindObjectsResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultBrowseIndexResultXslt");
        checkResultStylesheet("/index/" + indexName, props, "fgsindex.defaultGetIndexInfoResultXslt");

        // Check indexDir (note: new File(...) never returns null; test the directory itself)
        String indexDir = insertSystemProperties(props.getProperty("fgsindex.indexDir"));
        File indexDirFile = new File(indexDir);
        if (!indexDirFile.isDirectory()) {
            errors.append("\n*** " + configName + "/index/" + indexName + " fgsindex.indexDir="
                    + indexDir + " must exist as a directory");
        }

        // Check analyzer class for lucene and solr
        if (operationsImpl.indexOf("fgslucene") > -1 || operationsImpl.indexOf("fgssolr") > -1) {
            String analyzer = props.getProperty("fgsindex.analyzer");
            if (analyzer == null || analyzer.equals("")) {
                analyzer = defaultAnalyzer;
            }
            try {
                Class analyzerClass = Class.forName(analyzer);
                try {
                    Analyzer a = (Analyzer) analyzerClass.getConstructor(new Class[] {})
                            .newInstance(new Object[] {});
                } catch (InstantiationException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (IllegalAccessException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (InvocationTargetException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error.\n" + e.toString());
                } catch (NoSuchMethodException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + " " + analyzer
                            + ": fgsindex.analyzer=" + analyzer + ": instantiation error:\n" + e.toString());
                }
            } catch (ClassNotFoundException e) {
                errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.analyzer="
                        + analyzer + ": class not found:\n" + e.toString());
            }
        }

        // Add untokenizedFields property for lucene
        if (operationsImpl.indexOf("fgslucene") > -1) {
            String defaultUntokenizedFields = props.getProperty("fgsindex.untokenizedFields");
            if (defaultUntokenizedFields == null)
                props.setProperty("fgsindex.untokenizedFields", "");
            if (indexDirFile.isDirectory()) {
                StringBuffer untokenizedFields = new StringBuffer(
                        props.getProperty("fgsindex.untokenizedFields"));
                IndexReader ir = null;
                try {
                    ir = IndexReader.open(FSDirectory.open(new File(indexDir)), true);
                    // sample at most the first ten documents
                    int max = ir.numDocs();
                    if (max > 10)
                        max = 10;
                    for (int i = 0; i < max; i++) {
                        Document doc = ir.document(i);
                        for (ListIterator li = doc.getFields().listIterator(); li.hasNext();) {
                            Field f = (Field) li.next();
                            if (!f.isTokenized() && f.isIndexed() && untokenizedFields.indexOf(f.name()) < 0) {
                                untokenizedFields.append(" " + f.name());
                            }
                        }
                    }
                } catch (Exception e) {
                    // ignore: field sampling is best-effort
                }
                props.setProperty("fgsindex.untokenizedFields", untokenizedFields.toString());
                if (logger.isDebugEnabled())
                    logger.debug("indexName=" + indexName + " fgsindex.untokenizedFields="
                            + untokenizedFields);
            }
        }

        // Check defaultQueryFields - how can we check this?
        String defaultQueryFields = props.getProperty("fgsindex.defaultQueryFields");

        // Use custom URIResolver if given
        //MIH: also check for solr
        if (operationsImpl.indexOf("fgslucene") > -1 || operationsImpl.indexOf("fgssolr") > -1) {
            Class uriResolverClass = null;
            String uriResolver = props.getProperty("fgsindex.uriResolver");
            if (!(uriResolver == null || uriResolver.equals(""))) {
                try {
                    uriResolverClass = Class.forName(uriResolver);
                    try {
                        URIResolverImpl ur = (URIResolverImpl) uriResolverClass.getConstructor(new Class[] {})
                                .newInstance(new Object[] {});
                        if (ur != null) {
                            ur.setConfig(this);
                            indexNameToUriResolvers.put(indexName, ur);
                        }
                    } catch (InstantiationException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (IllegalAccessException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (InvocationTargetException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error.\n"
                                + e.toString());
                    } catch (NoSuchMethodException e) {
                        errors.append("\n*** " + configName + "/index/" + indexName + " " + uriResolver
                                + ": fgsindex.uriResolver=" + uriResolver + ": instantiation error:\n"
                                + e.toString());
                    }
                } catch (ClassNotFoundException e) {
                    errors.append("\n*** " + configName + "/index/" + indexName + ": fgsindex.uriResolver="
                            + uriResolver + ": class not found:\n" + e.toString());
                }
            }
        }
    }
    if (logger.isDebugEnabled())
        logger.debug("configCheck configName=" + configName + " errors=" + errors.toString());
    if (errors.length() > 0)
        throw new ConfigException(errors.toString());
}
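The untokenizedFields detection buried in checkConfig() above caps its scan at min(numDocs(), 10) documents. The same sampling idea as a standalone sketch (Lucene 3.x API, matching the example; indexDir as above; like the original, it assumes the sampled index has no deletions):

// open read-only and sample at most the first ten documents
IndexReader ir = IndexReader.open(FSDirectory.open(new File(indexDir)), true);
Set<String> untokenized = new HashSet<String>();
int max = Math.min(ir.numDocs(), 10);
for (int i = 0; i < max; i++) {
    for (Object o : ir.document(i).getFields()) {
        Field f = (Field) o;
        if (f.isIndexed() && !f.isTokenized()) {
            untokenized.add(f.name()); // indexed but not tokenized
        }
    }
}
ir.close();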
From source file: drakkar.mast.retrieval.SVNContext.java
/**
 * {@inheritDoc}
 */
public boolean loadIndex(File indexPath) throws IOException, IndexException {
    IndexReader reader = null;
    boolean flag = false;
    // the null check must come before any dereference of indexPath
    if (indexPath == null || !indexPath.exists() || !indexPath.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(indexPath))) {
        message = "Not found index in default index path";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        throw new IndexException(message);
    } else {
        reader = IndexReader.open(FSDirectory.open(indexPath));
        loadedDocs = reader.numDocs();
        reader.close();
        message = "Loading SVN index...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            message = "Error loading index: " + ex.toString();
            OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, message);
        }
        message = "Total of documents of the index: " + loadedDocs;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        flag = true;
        this.notifyLoadedDocument(loadedDocs);
    }
    return flag;
}
From source file: drakkar.mast.retrieval.SVNContext.java
/**
 * {@inheritDoc}
 */
public boolean loadIndex() throws IndexException, IOException {
    IndexReader reader = null;
    File defaultFile = new File(this.defaultIndexPath);
    boolean flag = false;
    // the null check must come before any dereference of defaultFile
    if (defaultFile == null || !defaultFile.exists() || !defaultFile.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(defaultFile))) {
        message = "Not found index in default index path";
        OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
        throw new IndexException(message);
    } else {
        reader = IndexReader.open(FSDirectory.open(defaultFile));
        loadedDocs = reader.numDocs();
        reader.close();
        message = "Loading SVN index...";
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        try {
            Thread.sleep(2000);
        } catch (InterruptedException ex) {
            message = "Error loading index: " + ex.toString();
            OutputMonitor.printLine(message, OutputMonitor.ERROR_MESSAGE);
            this.notifyTaskProgress(ERROR_MESSAGE, message);
        }
        message = "Total of documents of the index: " + loadedDocs;
        OutputMonitor.printLine(message, OutputMonitor.INFORMATION_MESSAGE);
        this.notifyTaskProgress(INFORMATION_MESSAGE, message);
        flag = true;
        this.notifyLoadedDocument(loadedDocs);
    }
    return flag;
}
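Both loadIndex variants above repeat the same open/count/close sequence and differ only in where the index directory comes from. A consolidating helper, sketched against the same Lucene 3.x API (the method name countLiveDocs is hypothetical):

// Hypothetical helper: validate the directory, then return its live-document count.
private static int countLiveDocs(File indexDir) throws IOException {
    if (indexDir == null || !indexDir.isDirectory()
            || !IndexReader.indexExists(FSDirectory.open(indexDir))) {
        throw new IOException("No index found at " + indexDir);
    }
    IndexReader reader = IndexReader.open(FSDirectory.open(indexDir));
    try {
        return reader.numDocs();
    } finally {
        reader.close();
    }
}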
From source file: edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java
License: Open Source License
/**
 * Extracts the top n most frequent terms (by document frequency) from an existing
 * Lucene index (the dir must be specified via args, or via the tfidfTester entry in
 * resources/lite/configs/general.conf; in this case the Wikipedia corpus) and
 * reports their document frequency.
 *
 * @param args
 * @throws java.lang.Exception
 */
public static void main(String[] args) throws Exception {
    try {
        Properties prop = new Properties();
        InputStream is = new FileInputStream("resources/lite/configs/general.conf");
        prop.load(is); // load the config before reading properties from it
        FSDirectory dir;
        if (args.length == 1) {
            if (Paths.get(args[0]).toFile().isDirectory()) {
                dir = FSDirectory.open(new File(args[0]));
            } else {
                System.out.println("The specified directory does not exist\n"
                        + " falling back to the lucene index specified in the config files");
                dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
            }
        } else if (args.length > 1) {
            System.out.println("The args only need one parameter, the directory of the Lucene index\n "
                    + "falling back to the lucene index specified in the config files");
            dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
        } else {
            dir = FSDirectory.open(new File(prop.getProperty("tfidfTester")));
        }
        String field = null;
        boolean includeTermFreqs = true;
        IndexReader reader = DirectoryReader.open(dir);
        System.out.println("num Docs " + reader.numDocs());
        TermStats[] terms = getHighFreqTerms(reader, numTerms, field);
        if (!includeTermFreqs) {
            // default HighFreqTerms behavior
            for (int i = 0; i < terms.length; i++) {
                System.out.printf("%s:%s %,d \n", terms[i].field, terms[i].termtext.utf8ToString(),
                        terms[i].docFreq);
            }
        } else {
            TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
            for (int i = 0; i < termsWithTF.length; i++) {
                System.out.printf("%s:%s \t totalTF = %,d \t doc freq = %,d \n", termsWithTF[i].field,
                        termsWithTF[i].termtext.utf8ToString(), termsWithTF[i].totalTermFreq,
                        termsWithTF[i].docFreq);
            }
        }
        reader.close();
    } catch (Exception ex) {
        logger.error("Does the specified directory contain a Lucene index?", ex);
    }
}
From source file: edu.illinois.cs.cogcomp.bigdata.lucene.Lucene.java
License: Open Source License
/**
 * Uses a custom similarity to compute idf; use this if you want to implement
 * IDF(numDocs, docFreq) yourself.
 *
 * @param reader
 * @param field
 * @param tfidfSIM
 * @return a map from each term in the field to its idf value
 * @throws IOException
 */
public static Map<String, Float> getIdfs(IndexReader reader, String field, TFIDFSimilarity tfidfSIM)
        throws IOException {
    Map<String, Float> docFrequencies = new HashMap<>();
    TermsEnum termEnum = MultiFields.getTerms(reader, field).iterator();
    BytesRef bytesRef;
    while ((bytesRef = termEnum.next()) != null) {
        if (termEnum.seekExact(bytesRef)) {
            String term = bytesRef.utf8ToString();
            float idf = tfidfSIM.idf(termEnum.docFreq(), reader.numDocs());
            docFrequencies.put(term, idf);
        }
    }
    return docFrequencies;
}
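Usage sketch for getIdfs: DefaultSimilarity extends TFIDFSimilarity and implements idf(docFreq, numDocs) as log(numDocs / (docFreq + 1)) + 1, so the returned map holds that value for every term in the field. The field name "text" and the index path are assumptions:

IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); // indexPath assumed
// the classic Lucene idf: log(numDocs / (docFreq + 1)) + 1
Map<String, Float> idfs = Lucene.getIdfs(reader, "text", new DefaultSimilarity());
System.out.println("idf('lucene') = " + idfs.get("lucene"));
reader.close();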