List of usage examples for org.apache.lucene.index.IndexReader#numDocs
public abstract int numDocs();
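numDocs() returns the number of documents in the index, not counting deletions. The examples below span several Lucene generations: the 3.x style (IndexReader.open, Hits, TermEnum) and the 4.x style (DirectoryReader.open). As a minimal sketch of the call itself, assuming a Lucene 4.x index at a hypothetical location you substitute:

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

// Sketch: open a reader and report live vs. total document slots.
// numDocs() counts live (non-deleted) documents; maxDoc() also counts
// deleted documents that have not yet been merged away.
try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")))) {
    System.out.println("numDocs = " + reader.numDocs());
    System.out.println("maxDoc  = " + reader.maxDoc());
}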
From source file:edu.utah.bmi.ibiomes.catalog.MetadataLookup.java
License:Open Source License
/**
 * Get all standard attributes from the dictionary
 * @return List of standard metadata attributes
 * @throws IOException
 * @throws CorruptIndexException
 */
public MetadataAttributeList getAllMetadataAttributes() throws CorruptIndexException, IOException {
    logger.info("Loading list of standard metadata attributes");
    MetadataAttributeList attrs = new MetadataAttributeList();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexFile));
    for (int d = 0; d < reader.numDocs(); d++) {
        Document doc = reader.document(d);
        MetadataAttribute attribute = getAttributeFromDocument(doc);
        attrs.add(attribute);
    }
    return attrs;
}
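Note that this loop is only safe on an index with no deletions: numDocs() counts live documents, while document IDs run up to maxDoc(), so on an index containing deleted documents the loop can both miss live documents and request deleted ones. A deletion-safe sketch of the same iteration (Lucene 4.x API; MultiFields.getLiveDocs returns null when nothing is deleted):

import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.Bits;

// Iterate up to maxDoc() and skip slots flagged as deleted.
Bits liveDocs = MultiFields.getLiveDocs(reader);
for (int d = 0; d < reader.maxDoc(); d++) {
    if (liveDocs != null && !liveDocs.get(d)) {
        continue; // deleted document slot
    }
    Document doc = reader.document(d);
    attrs.add(getAttributeFromDocument(doc));
}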
From source file:edu.utsa.sifter.IndexResource.java
License:Apache License
@Path("index") @POST/*from w w w . java 2 s .c o m*/ @Consumes({ MediaType.APPLICATION_JSON }) @Produces({ MediaType.APPLICATION_JSON }) public IndexInfo openIndex(IndexInfo idx) { if (idx.Id == null) { idx.Id = new String(Hex.encodeHex(Hasher.digest(idx.Path.getBytes()))); } idx.Id = idx.Id.toLowerCase(); IndexReader rdr = State.Indices.get(idx.Id); if (rdr == null) { try { final File evPath = new File(idx.Path); final File primaryIdx = new File(evPath, "primary-idx"); final File somIdx = new File(evPath, "som-idx"); DirectoryReader parallel[] = new DirectoryReader[2]; parallel[0] = DirectoryReader.open(FSDirectory.open(primaryIdx)); parallel[1] = DirectoryReader.open(FSDirectory.open(somIdx)); rdr = new ParallelCompositeReader(parallel); } catch (IOException ex) { HttpResponse.setStatus(HttpServletResponse.SC_NOT_FOUND); } } if (rdr != null) { idx.NumDocs = rdr.numDocs(); State.Indices.put(idx.Id, rdr); State.IndexLocations.put(idx.Id, idx); } return idx; }
From source file:edu.wayne.cs.severe.ir4se.lucene.SearchFiles.java
License:Apache License
@SuppressWarnings("deprecation") public static void search(String system, int documentType, int queryNumber, String indexDirectoryPath, String queryString, String fileOutput, String[] targetClasses, boolean runIndividualTerms, boolean append) throws Exception { String index = indexDirectoryPath; FileWriter f = new FileWriter(index + "../NotFound.txt", true); for (int i = 0; i < targetClasses.length; i++) { String target = targetClasses[i]; boolean found = Indexer.isFileDocInIndex(indexDirectoryPath, target); if (!found) f.append("Target doc " + i + " - " + target + " not found in index!\n"); }/*from www . j ava2 s . c o m*/ f.close(); IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true); int numDocs = reader.numDocs(); System.out.println("The number of documents in the index is: " + numDocs); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35); String[] fields; fields = new String[1]; fields[0] = "contents"; if (!runIndividualTerms) { MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer); int hitsPerPage = numDocs; TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); Query query = parser.parse(queryString); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; System.out.println("The number of hits is: " + hits.length); // file with the results (score and position) only for the relevant // documents // the file contains entries in the following format: // (queryNumber,relDoc1,posRelDoc1,scoreRelDoc1,relDoc2,posRelDoc2,scoreRelDoc2,...) FileWriter fwRelevant = new FileWriter(fileOutput, append); String path = ""; String docName = ""; String docPathAndName = ""; for (String target : targetClasses) { boolean found = false; for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); path = d.get("path"); float score = hits[i].score; if (documentType == 2) { docName = d.get("docName"); docPathAndName = path.toLowerCase() + "." + docName.toLowerCase(); if (target.equalsIgnoreCase(docPathAndName)) { fwRelevant.write(system + ";" + queryNumber + ";" + target + ";" + (i + 1) + ";" + hits.length + ";" + numDocs + ";" + score + "\n"); found = true; break; } } else if (documentType == 1) { File pathDir = new File(path.trim()); String fileName = pathDir.getName(); docName = fileName.replaceAll(".txt", ""); fwRelevant.write((i + 1) + ". 
doc = " + docName + " score = " + score + "\n"); } } if (found == false) fwRelevant.write(system + ";" + queryNumber + ";" + target + "; NOT_RETRIEVED" + "\n"); } // fw.close(); fwRelevant.close(); reader.close(); } else // runIndividualTerms = true { /** * each query will be divided in its constituent terms and each term * will be run as a separate query **/ /** * this is useful to determine the similarity of each of the terms * in a query to a target document so that we determine which terms * in the query tend to lead to the best results, i.e., to finding * the targets sooner **/ SearchFiles.search(system, documentType, queryNumber, indexDirectoryPath, queryString, fileOutput.replaceAll(".txt", "_wholeQuery.txt"), targetClasses, false, append); FileWriter fw = new FileWriter(fileOutput.replaceAll(".txt", "_terms.txt")); fw.write( "\n\n\n------------------------------------------------------------------------------------\n\n"); fw.write(" Results for query " + queryNumber + "\n"); fw.write("------------------------------------------------------------------------------------\n\n"); // file with the results (score and position) only for the relevant // documents // the file contains entries in the following format: // (queryNumber,term1,term1TF,term1DF,relDoc1,posRelDoc1Term1,scoreRelDoc1Term1,relDoc2,posRelDoc2Term1,scoreRelDoc2Term1,...) // (queryNumber,term2,term2TF,term2DF,relDoc1,posRelDoc1Term2,scoreRelDoc1Term2,relDoc2,posRelDoc2Term2,scoreRelDoc2Term2,...) // ... FileWriter fwRelevant = new FileWriter( fileOutput.replaceAll(".txt", "_terms_RelevantDocsPositions.txt")); String[] queryTerms = queryString.split(" "); for (int l = 0; l < queryTerms.length; l++) { MultiFieldQueryParser parser = new MultiFieldQueryParser(Version.LUCENE_35, fields, analyzer); int hitsPerPage = numDocs; TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); String q = queryTerms[l]; Query query = parser.parse(q); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; fw.write("TERM " + (l + 1) + ": " + q + "\n\n"); fwRelevant.write("\n" + queryNumber + "," + q); for (int i = 0; i < hits.length; i++) { int docId = hits[i].doc; Document d = searcher.doc(docId); String path = d.get("path"); float score = hits[i].score; if (documentType == 2) { String docName = d.get("docName"); fw.write((i + 1) + ". doc = " + path + " " + docName + " - score = " + score + "\n"); for (int k = 0; k < targetClasses.length; k++) { if (docName.equalsIgnoreCase(targetClasses[k])) { String contents = d.get("contents"); int frequency = countOccurrences(contents, q);// tf fwRelevant.write("," + frequency); fwRelevant.write("," + reader.docFreq(new Term("contents", q)));// df fwRelevant.write("," + path + "." + docName + "," + (i + 1) + "," + score); break; } } } else if (documentType == 1) { File pathDir = new File(path); String fileName = pathDir.getName(); String docName = fileName.replaceAll(".txt", ""); fw.write((i + 1) + ". doc = " + docName + " score = " + score + "\n"); } } fw.write("\n\n\n"); } fw.close(); f.close(); fwRelevant.close(); reader.close(); } }
From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java
License:Open Source License
/**
 * @see es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorService#busquedaAvanzada(es.pode.indexador.negocio.servicios.busqueda.ParamAvanzadoVO)
 * @param ParamAvanzadoVO VO holding the parameters accepted by an advanced search.
 * @return DocumentosVO This class represents the results of a search.
 */
protected es.pode.indexador.negocio.servicios.busqueda.DocumentosVO handleBusquedaAvanzada(
        es.pode.indexador.negocio.servicios.busqueda.ParamAvanzadoVO paramBusq) throws java.lang.Exception {
    // Implement the advanced search.
    DocumentosVO respuesta = new DocumentosVO();
    if (logger.isDebugEnabled())
        logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada: Parametros de la busqueda avanzada "
                + paramBusqAvanz2String(paramBusq));
    DisjunctionMaxQuery query = new DisjunctionMaxQuery(0.01f);
    long start = System.currentTimeMillis();
    Object[] hits = null;
    boolean resultadoUnico = false;
    // Some fixing with "*" and ".*"
    if (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().trim().equals("*")
            && !paramBusq.getPalabrasClave().trim().equals(".*")) {
        if (paramBusq.getPalabrasClave().trim().toLowerCase()
                .startsWith(props.getProperty("agrega_admin") + " ")) {
            hits = new Hits[1];
            hits = this.internaBusquedaQuery(paramBusq, paramBusq.getPalabrasClave().toLowerCase().trim()
                    .substring((props.getProperty("agrega_admin") + " ").length()), query, false, null);
        } else if (paramBusq.getPalabrasClave().trim().toLowerCase()
                .startsWith(props.getProperty("agrega_todos") + " ")) {
            IndiceVO[] idiomas = this.handleObtenerIdiomasBusqueda();
            hits = new Hits[idiomas.length];
            hits = this.internaBusquedaQuery(paramBusq, paramBusq.getPalabrasClave().toLowerCase().trim()
                    .substring((props.getProperty("agrega_todos") + " ").length()), query, true, idiomas);
        } else {
            resultadoUnico = true;
            hits = new Hits[1];
            hits[0] = internaBusquedaAvanzada(paramBusq, query);
        }
    } else {
        resultadoUnico = true;
        hits = new Hits[1];
        hits[0] = internaBusquedaAvanzada(paramBusq, query);
    }
    long end = System.currentTimeMillis();
    logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Busqueda local realizada en ="
            + (end - start) + " milisegundos.");
    if (logger.isDebugEnabled()) {
        logger.debug("Se devuelven los hits");
        logger.debug("Se devuelven los hits NULL o 0 vamos a por las sugerencias");
    }
    if (paramBusq.getBusquedaSimpleAvanzada() != null
            && !paramBusq.getBusquedaSimpleAvanzada().equals(BUSCARRSS)) {
        try {
            Directory directorioIndiceSpell = this.indiceSpell(paramBusq.getIdiomaBusqueda());
            if (logger.isDebugEnabled())
                logger.debug("El indice de spellchecker es " + directorioIndiceSpell);
            SpellChecker spellChecker = new SpellChecker(directorioIndiceSpell);
            String sQuery = query.toString();
            // TODO: rework the suggestions so they take into account more than just the "keywords" parameters?
            if (!spellChecker.exist(sQuery)
                    && (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().equals(""))) {
                List claveBusqueda = this.obtenerPalabrasClave(paramBusq.getPalabrasClave().toLowerCase(), false);
                List<String> sugerencias = new ArrayList<String>();
                if (claveBusqueda != null && claveBusqueda.size() > 0) {
                    boolean suficientes = false;
                    for (int i = 0; i < claveBusqueda.size() && !suficientes; i++) {
                        if (!((String) claveBusqueda.get(i)).equals("")) {
                            String[] suge = spellChecker.suggestSimilar((String) claveBusqueda.get(i),
                                    NUMERO_SUGERENCIAS);
                            if (suge != null && suge.length > 0) {
                                for (int k = 0; k < suge.length && sugerencias.size() < NUMERO_SUGERENCIAS; k++) {
                                    boolean encontrado = false;
                                    for (int j = 0; j < sugerencias.size() && !encontrado; j++) {
                                        if (sugerencias.get(j).toString().equals(suge[k]))
                                            encontrado = true;
                                    }
                                    if (!encontrado && validarPersonalizada(paramBusq)) {
                                        Hits hitSugerencias = null;
                                        ParamAvanzadoVO paramBusqSug = paramBusq;
                                        paramBusqSug.setPalabrasClave(suge[k]);
                                        try {
                                            hitSugerencias = internaBusquedaAvanzada(paramBusqSug, query);
                                            if (hitSugerencias != null && hitSugerencias.length() > 0)
                                                sugerencias.add(suge[k]);
                                        } catch (Exception e) {
                                            logger.error(
                                                    "SrvBuscadorServiceImpl - handleBuscarAvanzado:Error solicitando comprobación sugerencia avanzada. Sugerencia="
                                                            + suge[k], e);
                                        }
                                    } else if (!encontrado && !validarPersonalizada(paramBusq))
                                        sugerencias.add(suge[k]);
                                }
                            }
                            if (sugerencias.size() == NUMERO_SUGERENCIAS)
                                suficientes = true;
                        }
                    }
                }
                String[] cargarSugerencias = new String[] {};
                if (sugerencias != null && sugerencias.size() > 0) {
                    cargarSugerencias = new String[sugerencias.size()];
                    for (int i = 0; i < sugerencias.size(); i++) {
                        cargarSugerencias[i] = sugerencias.get(i);
                    }
                }
                respuesta.setSugerencias(cargarSugerencias);
            } else
                respuesta.setSugerencias(new String[] {});
        } catch (Exception e) {
            logger.error("SrvBuscadorServiceImpl - handleBuscarAvanzado:Error solicitando sugerencias para idioma:"
                    + paramBusq.getIdiomaBusqueda(), e);
            respuesta.setSugerencias(new String[] {});
        }
        try {
            es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] cargarTesauros = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {};
            if (paramBusq.getPalabrasClave() != null && !paramBusq.getPalabrasClave().trim().equals("")) {
                List palabrasTesauro = this.obtenerPalabrasClave(paramBusq.getPalabrasClave().toLowerCase(), true);
                List<String> nombreTesauros = new ArrayList<String>();
                List<String> identificadorTesauros = new ArrayList<String>();
                if (palabrasTesauro != null && palabrasTesauro.size() > 0) {
                    int numeroTax = 0;
                    for (int i = 0; i < palabrasTesauro.size()
                            && (numeroTax < Integer.parseInt(props.getProperty("numero_tesauros"))); i++) {
                        TaxonVO[] taxones = this.getSrvTesaurosServices().obtenerTerminosRelacionadosPorTexto(
                                (String) palabrasTesauro.get(i), props.getProperty("nombre_tesauro"),
                                paramBusq.getIdiomaBusqueda());
                        String[] idTesauro = new String[taxones.length];
                        for (int k = 0; k < taxones.length; k++) {
                            idTesauro[k] = taxones[k].getId();
                        }
                        for (int k = 0; k < taxones.length
                                && (numeroTax < Integer.parseInt(props.getProperty("numero_tesauros"))); k++) {
                            Integer[] tesauros = NumTermsArbol.obtenerNumeroNodos(idTesauro,
                                    getIndexPathByLanguage(paramBusq.getIdiomaBusqueda()), "tesauro").getConteo();
                            if (idTesauro != null && idTesauro.length != 0) {
                                for (int j = 0; j < idTesauro.length; j++) {
                                    if (idTesauro[j].equals(taxones[k].getId())) {
                                        if (tesauros[j].intValue() > 0) {
                                            nombreTesauros.add(taxones[k].getValorTax());
                                            identificadorTesauros.add(taxones[k].getId());
                                            numeroTax = numeroTax + 1;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
                if (nombreTesauros != null && nombreTesauros.size() > 0) {
                    cargarTesauros = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[nombreTesauros.size()];
                    for (int i = 0; i < nombreTesauros.size(); i++) {
                        cargarTesauros[i] = new es.pode.indexador.negocio.servicios.busqueda.TaxonVO(
                                identificadorTesauros.get(i).toString(), nombreTesauros.get(i).toString());
                    }
                }
                respuesta.setTesauros(cargarTesauros);
            } else
                respuesta.setTesauros(new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {});
        } catch (Exception e) {
            logger.error("SrvBuscadorServiceImpl - handleBuscarAvanzado:Error obteniendo sugerencias de tesauro ["
                    + props.getProperty("nombre_tesauro") + "] con:" + paramBusq.getPalabrasClave()
                    + " número de tesauros máximo solicitado=" + props.getProperty("numero_tesauros")
                    + " e idioma=" + paramBusq.getIdiomaBusqueda(), e);
            respuesta.setTesauros(new es.pode.indexador.negocio.servicios.busqueda.TaxonVO[] {});
        }
    }
    if (hits == null || (resultadoUnico && hits[0] == null)) {
        respuesta.setTotalResultados(new Integer(0));
        respuesta.setNumeroResultados(new Integer(0));
        respuesta.setNumDocumentosIndice(new Integer(0));
    } else {
        long start2 = System.currentTimeMillis();
        if (hits.length > 1) {
            int totalResultados = 0;
            List docsList = new ArrayList();
            for (int i = 0; i < hits.length && docsList.size() <= paramBusq.getNumeroResultados().intValue(); i++) {
                if (hits[i] != null && ((Hits) hits[i]).length() > 0) {
                    totalResultados = totalResultados + ((Hits) hits[i]).length();
                    DocVO[] docs = this.getArrayDocsFromHits((Hits) hits[i],
                            ((((Hits) hits[i]).length() < paramBusq.getNumeroResultados().intValue())
                                    || paramBusq.getNumeroResultados().intValue() == -1)
                                            ? ((Hits) hits[i]).length()
                                            : paramBusq.getNumeroResultados().intValue());
                    for (int j = 0; j < docs.length; j++) {
                        docsList.add(docs[j]);
                    }
                }
            }
            DocVO[] docs = new DocVO[docsList.size()];
            for (int i = 0; i < docs.length; i++) {
                docs[i] = (DocVO) docsList.get(i);
            }
            respuesta.setTotalResultados(new Integer(totalResultados));
            respuesta.setResultados(docs);
        } else {
            respuesta.setTotalResultados(new Integer(((Hits) hits[0]).length()));
            respuesta.setResultados(this.getArrayDocsFromHits((Hits) hits[0],
                    ((((Hits) hits[0]).length() < paramBusq.getNumeroResultados().intValue())
                            || paramBusq.getNumeroResultados().intValue() == -1) ? ((Hits) hits[0]).length()
                                    : paramBusq.getNumeroResultados().intValue()));
        }
        end = System.currentTimeMillis();
        logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Mapeo local realizado en ="
                + (end - start2) + " milisegundos.");
        IndexReader reader = IndexReader.open(this.getIndexByLanguage(paramBusq.getIdiomaBusqueda()));
        respuesta.setNumDocumentosIndice(new Integer(reader.numDocs()));
        respuesta.setNumeroResultados(new Integer(respuesta.getResultados().length));
    }
    logger.debug("SrvBuscadorServiceImpl - handleBusquedaAvanzada : Busqueda local realizada en ="
            + (end - start) + " milisegundos.");
    return respuesta;
}
From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java
License:Open Source License
/**
 * This method picks an ODE (learning object) at random from the repository.
 * @return DocVO Details of an indexed ODE.
 */
protected DocVO handleObtenerODERandom() throws Exception {
    List listaIndices = (List) this.getIndiceDao().loadAll(getIndiceDao().TRANSFORM_INDICEVO);
    if (listaIndices.size() == 0) // no indices to list, return nothing
        return null;
    Random random = new Random(Calendar.getInstance().getTimeInMillis());
    Document doc = null;
    boolean noCero = true;
    int intRandom = random.nextInt();
    int i = 0;
    int reintentosInt = 10;
    // intRandom may be 0; to guard against that, make at most 10 attempts
    for (i = 0; i < reintentosInt && intRandom != 0 && noCero; i++) {
        // Pick an index at random from all the repository indices
        int idiomaSeleciconado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % listaIndices.size();
        IndiceVO indice = (IndiceVO) listaIndices.get(idiomaSeleciconado);
        // Open the index and get the number of indexed documents
        Directory directorioIndiceSimple = null;
        directorioIndiceSimple = this.getIndexByLanguage(indice.getIdentificador());
        IndexReader indiceLectura = IndexReader.open(directorioIndiceSimple);
        int numeroDocumentos = indiceLectura.numDocs();
        logger.debug("El numero de documentos del indice es " + numeroDocumentos);
        // Select the document to extract
        if (numeroDocumentos > 0) {
            intRandom = random.nextInt();
            noCero = false;
            int documentoSeleccionado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % numeroDocumentos;
            logger.info("Devuelto documento [" + documentoSeleccionado + "] de [" + numeroDocumentos
                    + "] documentos totales indexados.");
            doc = indiceLectura.document(documentoSeleccionado);
        }
        indiceLectura.close();
    }
    if (i == reintentosInt && noCero) {
        logger.info("No se ha encontrado ningún random válido en [" + reintentosInt + "] intentos");
    }
    if (doc != null)
        return getVOFromLucene(doc, new DocVO(), 0);
    else
        return null;
}
From source file:eu.eexcess.sourceselection.redde.dbsampling.DBSampler.java
License:Apache License
/**
 * Estimates the database size of the general database using sample-resample
 * and the search term "term".
 *
 * @param term
 *            one-term search term for general and sampled index
 * @return the estimated database size of the general index
 * @throws ParseException
 * @throws IOException
 * @throws IllegalArgumentException
 *             if an index (base or sampled) contains no documents
 */
private double resample(String term) throws ParseException, IOException, IllegalArgumentException {
    Query query = new QueryParser(Settings.IndexFields.IndexTextField, new EnglishAnalyzer()).parse(term);
    outIndexWriter.commit();
    if (inIndexReader.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [inIndexReader]");
    } else if (outIndexWriter.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [outIndexWriter]");
    }
    double estimation = 0;
    IndexReader sampleIndexReader = null;
    try {
        // get total hits for term in sample index
        sampleIndexReader = DirectoryReader.open(outIndexWriter, true);
        IndexSearcher sampleIndexSearcher = new IndexSearcher(sampleIndexReader);
        TopDocs sampleSearchDocs = sampleIndexSearcher.search(query, sampleIndexReader.numDocs());
        // get total hits for term in general index
        IndexSearcher generalIndexSearcher = new IndexSearcher(inIndexReader);
        TopDocs generalSearchDocs = generalIndexSearcher.search(query, inIndexReader.numDocs());
        estimation = estimationCalculator(generalSearchDocs.totalHits, sampleSearchDocs.totalHits,
                sampleIndexReader.numDocs(), true);
    } finally {
        if (sampleIndexReader != null) {
            sampleIndexReader.close();
        }
    }
    return estimation;
}
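The body of estimationCalculator is not shown in this listing. For reference, here is a hedged sketch of the classic sample-resample size estimate (Si and Callan, 2003), which the three arguments above would support; the project's actual formula may differ:

// Hypothetical sketch, not the project's estimationCalculator.
// Assumption: the term's document frequency in the sample (dfSample out
// of sampleSize docs) is proportional to its frequency in the full
// database (dfFull out of N), so N is estimated as dfFull * sampleSize / dfSample.
static double sampleResampleEstimate(long dfFull, long dfSample, int sampleSize) {
    if (dfSample == 0) {
        throw new IllegalArgumentException("term does not occur in the sample");
    }
    return (double) dfFull * sampleSize / dfSample;
}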
From source file:eu.eexcess.sourceselection.redde.Redde.java
License:Apache License
private ScoreDoc[] scoreDatabase(String queryString, IndexReader database) throws ParseException, IOException {
    Query query = new QueryParser(Settings.IndexFields.IndexTextField, new EnglishAnalyzer())
            .parse(queryString);
    IndexSearcher searcher = new IndexSearcher(database);
    TopDocs topDocs = searcher.search(query, database.numDocs());
    return topDocs.scoreDocs;
}
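Passing database.numDocs() as the result-window size makes the search exhaustive: every matching document is scored and returned. One caveat, stated as an assumption: on an empty reader numDocs() is 0, and Lucene's top-docs collectors reject a requested hit count below 1, so a guard like this sketch may be needed:

// Sketch (assumption: the caller may hand us an empty reader).
// Clamp the requested hit count to at least 1 before searching.
int window = Math.max(1, database.numDocs());
TopDocs topDocs = searcher.search(query, window);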
From source file:fr.ericlab.sondy.algo.eventdetection.EDCoW.java
License:Open Source License
@Override
public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            delta = Integer.parseInt(parameters.get(2).getValue());
        }
        if (parameters.get(3).getValue() != null && !parameters.get(3).getValue().equals("")) {
            gamma = Integer.parseInt(parameters.get(3).getValue());
        }
        if (parameters.get(4).getValue() != null && !parameters.get(4).getValue().equals("")) {
            delta2 = Integer.parseInt(parameters.get(4).getValue());
        }
        long startNanoTime = System.nanoTime();
        int intervals = appVariables.messageSet.nbTimeSlice;
        int windows = intervals / delta2;
        events = new LinkedList<>();
        termDocsMap = new HashMap<>();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequencyf[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequencyf[r.numDocs()];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    termDocsMap.put(term, frequencyf);
                }
            }
        }
        indexAccess.close();
        for (int i = 0; i < windows; i++) {
            processWindow(i);
        }
        Collections.sort(events);
        results = FXCollections.observableArrayList();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        for (EDCoWEvent ev : events) {
            double interval[] = ev.getInterval(intervalDuration);
            results.add(new DetectionResult(ev.getKeywordsAsString(),
                    formatter.format(interval[0]) + ";" + formatter.format(interval[1])));
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed EDCoW, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", delta=" + delta + ", gamma=" + gamma + ". "
                + results.size() + " results in " + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (NumberFormatException | IOException ex) {
        Logger.getLogger(EDCoW.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:fr.ericlab.sondy.algo.eventdetection.ET.java
License:Open Source License
public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            gamma = Double.parseDouble(parameters.get(2).getValue());
        }
        if (parameters.get(3).getValue() != null && !parameters.get(3).getValue().equals("")) {
            alpha = Double.parseDouble(parameters.get(3).getValue());
        }
        long startNanoTime = System.nanoTime();
        HashMap<String, Bursts> mapBursts = new HashMap<>();
        HashMap<String, LinkedList<String>> mapCooccurences = new HashMap<>();
        LinkedList<String> bigrams = new LinkedList<>();
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages);
        int maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        DBAccess dbAccess = new DBAccess();
        dbAccess.initialize(appVariables, false);
        DataManipulation dataManipulation = new DataManipulation();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        int intervalNumber = r.numDocs();
        TermEnum allTerms = r.terms();
        LinkedList<Frequency> frequencies = new LinkedList<>();
        while (allTerms.next()) {
            String k = allTerms.term().text();
            if (!appVariables.isStopWord(k)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    bigrams.add(k);
                    frequencies.add(new Frequency(k, frequency, cf));
                }
            }
        }
        HashSet<String> bigramSet = new HashSet<>();
        bigramSet.addAll(bigrams);
        // Pre-computing AF total in all time-slices
        double AFtotal[] = new double[intervalNumber];
        for (int a = 0; a < intervalNumber; a++) {
            double AFa = 0;
            for (int b = 0; b < bigrams.size(); b++) {
                AFa += frequencies.get(b).AF[a];
            }
            AFtotal[a] = AFa;
        }
        int[] distribution = dataManipulation.getDistribution(appVariables);
        // Computing PF(kj_i) for all bigrams (j) in all time-slices (i)
        for (int j = 0; j < bigrams.size(); j++) {
            String k = bigrams.get(j);
            Frequency fr = frequencies.get(j);
            float AF[] = fr.AF;
            for (int i = 1; i < intervalNumber; i++) {
                double AFk = AF[i];
                double PFk = AFk / AFtotal[i];
                if (PFk > gamma) {
                    // Storing bursty intervals
                    Bursts bursts;
                    if (mapBursts.get(k) != null) {
                        bursts = mapBursts.get(k);
                    } else {
                        bursts = new Bursts();
                    }
                    // Calculating the increase
                    double h = PFk - frequencies.get(j).AF[i - 1] / distribution[i - 1];
                    if (h > 0) {
                        bursts.list.add(new Burst(i, h));
                        mapBursts.put(k, bursts);
                    }
                }
            }
            Bursts bursts = mapBursts.get(k);
            String tweets = "";
            if (bursts != null) {
                for (Burst burst : bursts.list) {
                    tweets += dbAccess.getMessagesAsString(appVariables, k, burst.U);
                }
            }
            LinkedList<String> FCB = getFrequentBigrams(tweets, bigramSet);
            mapCooccurences.put(k, FCB);
        }
        // Freeing memory
        frequencies = null;
        indexAccess.close();
        mapBursts = getSortedMapDesc(mapBursts);
        // Clustering keywords
        String strEvents = performHAC(appVariables, bigrams, mapBursts, mapCooccurences);
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        readEventsFromString(strEvents);
        appVariables.addLogEntry("[event detection] computed ET, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ", gamma=" + gamma + ", alpha=" + alpha + ". "
                + results.size() + " results in " + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(ET.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:fr.ericlab.sondy.algo.eventdetection.PeakyTopics.java
License:Open Source License
public ObservableList<DetectionResult> apply() {
    try {
        if (parameters.get(0).getValue() != null && !parameters.get(0).getValue().equals("")) {
            minTermSupport = Double.parseDouble(parameters.get(0).getValue());
        }
        if (parameters.get(1).getValue() != null && !parameters.get(1).getValue().equals("")) {
            maxTermSupport = Double.parseDouble(parameters.get(1).getValue());
        }
        if (parameters.get(2).getValue() != null && !parameters.get(2).getValue().equals("")) {
            smooth = Integer.parseInt(parameters.get(2).getValue());
        }
        long startNanoTime = System.nanoTime();
        DataManipulation dataManipulation = new DataManipulation();
        IndexAccess indexAccess = new IndexAccess(appVariables);
        IndexReader r = indexAccess.reader;
        TermEnum allTerms = r.terms();
        HashMap<DetectionResult, Float> score = new HashMap<>();
        int intervalNumber = r.numDocs();
        float intervalDuration = ((float) appVariables.getCurrentDatasetInterval()) / 60;
        int minTermOccur = (int) (minTermSupport * appVariables.nbMessages),
                maxTermOccur = (int) (maxTermSupport * appVariables.nbMessages);
        while (allTerms.next()) {
            String term = allTerms.term().text();
            if (term.length() > 1 && !appVariables.isStopWord(term)) {
                TermDocs termDocs = r.termDocs(allTerms.term());
                float frequency[] = indexAccess.getTermFrequency(appVariables, termDocs);
                float cf = frequency[intervalNumber];
                if (cf > minTermOccur && cf < maxTermOccur) {
                    if (smooth > 0) {
                        frequency = dataManipulation.getSmoothedTermFrequency(frequency, smooth);
                    }
                    float tf = 0;
                    int peakIndex = 0;
                    for (int i = appVariables.startTimeSlice; i <= appVariables.endTimeSlice; i++) {
                        if (frequency[i] > tf) {
                            tf = frequency[i];
                            peakIndex = i;
                        }
                    }
                    float peakDay = (peakIndex * intervalDuration) / 24;
                    float peakDay1 = ((peakIndex + 1) * intervalDuration) / 24;
                    score.put(new DetectionResult(term,
                            formatter.format(peakDay) + ";" + formatter.format(peakDay1)), tf / cf);
                }
            }
        }
        indexAccess.close();
        score = Collection.getSortedMapDesc(score);
        Set<Entry<DetectionResult, Float>> entrySet = score.entrySet();
        results = FXCollections.observableArrayList();
        for (Entry<DetectionResult, Float> entry : entrySet) {
            results.add(0, entry.getKey());
        }
        long endNanoTime = System.nanoTime();
        long elapsedNanoTime = endNanoTime - startNanoTime;
        double elaspedSecondTime = (double) elapsedNanoTime / (double) 1000000000;
        appVariables.addLogEntry("[event detection] computed peaky topics, minTermSupport=" + minTermSupport
                + ", maxTermSupport=" + maxTermSupport + ". " + results.size() + " results in "
                + formatter.format(elaspedSecondTime) + "s");
        return results;
    } catch (IOException ex) {
        Logger.getLogger(PeakyTopics.class.getName()).log(Level.SEVERE, null, ex);
        return null;
    }
}