List of usage examples for org.apache.lucene.index IndexReader close
@Override public final synchronized void close() throws IOException
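Every example below follows the same lifecycle: open a reader, use it, close it when done. As a minimal standalone sketch of that pattern (Lucene 4.x API assumed; the index path is a placeholder):

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CloseReaderExample {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(new File("/path/to/index")); // placeholder path
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // Use the reader while it is open.
            System.out.println("docs in index: " + reader.numDocs());
        } finally {
            // close() releases index files and cached resources;
            // the reader must not be used afterwards.
            reader.close();
            dir.close();
        }
    }
}

Since IndexReader implements Closeable, a try-with-resources block is an equivalent and more concise alternative to the explicit finally.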
From source file:engine.easy.search.RelevanceFeedBackUtil.java
License:Apache License
/**
 * Performs the thumbs-down action and generates a new query from the top
 * highest-frequency terms. It also decreases the boost of the relevant
 * documents so that they rank lower in search results for similar terms.
 */
public static Query performPesduoRelevance(Result[] results) {
    Query q = null;
    try {
        final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
        Map<Integer, Document> documentMap = new HashMap<Integer, Document>();
        List<String> termsList = new ArrayList<String>();

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        EasySearchIndexReader esiReader = new EasySearchIndexReader(indexReader);

        float boost = 0F;
        for (Result result : results) {
            TermFreqVector tfv = indexReader.getTermFreqVector(result.id, AppConstants.CONTENT_FIELD);
            Document doc = indexReader.document(result.id);
            boost += doc.getBoost() + AppConstants.THUMBS_UP;
            System.out.print("DOC : " + result.id + " Field : " + tfv.getField() + "\n");

            for (int i = 0; i < tfv.getTermFrequencies().length; i++) {
                if (!termsList.contains(tfv.getTerms()[i]))
                    termsList.add(tfv.getTerms()[i]);
                frequencyMap.put(tfv.getTerms()[i], tfv.getTermFrequencies()[i]);
            }
        }

        // Close the index reader.
        indexReader.close();

        // Boost the visibility of these terms so the matching documents
        // rank higher for the corresponding search terms.
        q = computeTopTermQuery(termsList, frequencyMap, AppConstants.TOP_DOCUMENTS);
        q.setBoost(boost);
        System.out.print("Query boost : " + boost);
    } catch (Exception e) {
        System.out.println("Exception: performPesduoRelevance " + e.toString());
    }
    return q;
}
From source file:engine.easy.search.RelevanceFeedBackUtil.java
License:Apache License
/**
 * Computes a term-frequency map for the overall index at the specified
 * location, builds a Boolean OR query out of the "most frequent" terms in
 * the index, and returns it. "Most frequent" means the terms whose
 * frequencies are greater than or equal to topTermCutoff * the frequency
 * of the top term, where topTermCutoff is a number between 0 and 1.
 *
 * @param ramdir the directory where the index is created.
 * @param numOf  the number of top terms to select (unused here;
 *               AppConstants.TOP_DOCUMENTS is passed instead).
 * @return a Boolean OR query.
 * @throws Exception if one is thrown.
 */
private static Query computeTopTermQueryFromDataCollection(Directory ramdir, int numOf) throws Exception {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();

    IndexReader reader = IndexReader.open(ramdir);
    TermEnum terms = reader.terms();
    while (terms.next()) {
        Term term = terms.term();
        String termText = term.text();
        int frequency = reader.docFreq(term);
        frequencyMap.put(termText, frequency);
        termlist.add(termText);
    }
    reader.close();

    return computeTopTermQuery(termlist, frequencyMap, AppConstants.TOP_DOCUMENTS);
}
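The computeTopTermQuery helper called throughout this file is not shown on this page. A hypothetical sketch of what it might look like, inferred from the cutoff rule the Javadoc above describes (the topTermCutoff value and the use of AppConstants.CONTENT_FIELD are assumptions, not the project's actual implementation):

// Hypothetical sketch of computeTopTermQuery, inferred from the Javadoc above.
// The cutoff constant and the field name are assumptions.
private static Query computeTopTermQuery(List<String> terms,
        final Map<String, Integer> frequencyMap, int numTerms) {
    final float topTermCutoff = 0.3F; // assumed value between 0 and 1

    // Sort terms by descending frequency.
    List<String> sorted = new ArrayList<String>(terms);
    Collections.sort(sorted, new Comparator<String>() {
        public int compare(String a, String b) {
            return frequencyMap.get(b) - frequencyMap.get(a);
        }
    });

    // OR together every term whose frequency is at least
    // topTermCutoff * the top term's frequency, up to numTerms terms.
    BooleanQuery query = new BooleanQuery();
    int topFreq = sorted.isEmpty() ? 0 : frequencyMap.get(sorted.get(0));
    for (int i = 0; i < sorted.size() && i < numTerms; i++) {
        String term = sorted.get(i);
        if (frequencyMap.get(term) < topTermCutoff * topFreq) {
            break;
        }
        query.add(new TermQuery(new Term(AppConstants.CONTENT_FIELD, term)),
                BooleanClause.Occur.SHOULD);
    }
    return query;
}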
From source file:engine.easy.search.RelevanceFeedBackUtil.java
License:Apache License
public static Query performUpAndDown(Map<Integer, Float> docMap) throws IOException {
    Query q = null;
    try {
        final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
        Map<Integer, Document> documentMap = new HashMap<Integer, Document>();
        List<String> termsList = new ArrayList<String>();

        Directory indexDir = FSDirectory.open(new File(AppConstants.INDEX_DIR_PATH));
        IndexReader indexReader = IndexReader.open(indexDir);
        EasySearchIndexReader esiReader = new EasySearchIndexReader(indexReader);

        for (Integer docId : docMap.keySet()) {
            TermFreqVector tfv = indexReader.getTermFreqVector(docId, AppConstants.CONTENT_FIELD);
            Document doc = indexReader.document(docId);
            System.out.print("DOC : " + docId + " Field : " + tfv.getField() + "\n");

            for (int i = 0; i < tfv.getTermFrequencies().length; i++) {
                if (!termsList.contains(tfv.getTerms()[i]))
                    termsList.add(tfv.getTerms()[i]);
                System.out.println("TERM : " + tfv.getTerms()[i] + " FREQ : " + tfv.getTermFrequencies()[i]);
                frequencyMap.put(tfv.getTerms()[i], tfv.getTermFrequencies()[i]);
            }

            // Keep the document under its doc id.
            documentMap.put(docId, doc);
        }

        // Close the index reader.
        indexReader.close();

        // Boost the visibility of these terms so the matching documents
        // rank higher for the corresponding search terms.
        q = computeTopTermQuery(termsList, frequencyMap, AppConstants.TOP_DOCUMENTS);
        q.setBoost(AppConstants.BOOST);
    } catch (Exception e) {
        System.out.println("Exception: performUpAndDown " + e.toString());
    }
    return q;
}
From source file:es.pode.indexador.negocio.servicios.busqueda.SrvBuscadorServiceImpl.java
License:Open Source License
/**
 * Looks up a random ODE (learning object) inside the repository.
 * @return DocVO Details of an indexed ODE.
 */
protected DocVO handleObtenerODERandom() throws Exception {
    List listaIndices = (List) this.getIndiceDao().loadAll(getIndiceDao().TRANSFORM_INDICEVO);
    if (listaIndices.size() == 0) // No indices to list, return nothing
        return null;

    Random random = new Random(Calendar.getInstance().getTimeInMillis());
    Document doc = null;
    boolean noCero = true;
    int intRandom = random.nextInt();
    int i = 0;
    int reintentosInt = 10;

    // intRandom may be 0; to avoid that, try at most 10 times.
    for (i = 0; i < reintentosInt && intRandom != 0 && noCero; i++) {
        // Pick an index at random from all the indices in the repository.
        int idiomaSeleciconado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % listaIndices.size();
        IndiceVO indice = (IndiceVO) listaIndices.get(idiomaSeleciconado);

        // Open the index and check the number of indexed documents.
        Directory directorioIndiceSimple = null;
        directorioIndiceSimple = this.getIndexByLanguage(indice.getIdentificador());
        IndexReader indiceLectura = IndexReader.open(directorioIndiceSimple);
        int numeroDocumentos = indiceLectura.numDocs();
        logger.debug("The index contains " + numeroDocumentos + " documents");

        // Select the document to extract.
        if (numeroDocumentos > 0) {
            intRandom = random.nextInt();
            noCero = false;
            int documentoSeleccionado = (intRandom < 0 ? (intRandom * (-1)) : intRandom) % numeroDocumentos;
            logger.info("Returned document [" + documentoSeleccionado + "] out of [" + numeroDocumentos
                    + "] total indexed documents.");
            doc = indiceLectura.document(documentoSeleccionado);
        }
        indiceLectura.close();
    }

    if (i == reintentosInt && noCero) {
        logger.info("No valid random value was found in [" + reintentosInt + "] attempts");
    }

    if (doc != null)
        return getVOFromLucene(doc, new DocVO(), 0);
    else
        return null;
}
From source file:es.pode.indexador.negocio.servicios.indexado.SrvIndexadorServiceImpl.java
License:Open Source License
/**
 * Builds the repository that provides suggested words.
 * @param directorioIndiceSimple Directory object with information about the index repository directory
 * @param directorioIndiceSpell  Directory object with information about the suggested-words repository directory
 * @throws IOException
 * @throws Exception
 */
private synchronized void spellCheckerAdd(Directory directorioIndiceSimple, Directory directorioIndiceSpell)
        throws IOException, Exception {
    if (logger.isDebugEnabled())
        logger.debug("Checking the spellchecker directory = " + directorioIndiceSpell
                + " and the plain one = " + directorioIndiceSimple);

    if (IndexReader.indexExists(directorioIndiceSimple)) {
        if (logger.isDebugEnabled())
            logger.debug("The simple index " + directorioIndiceSimple + " exists; opening it for reading.");
        IndexReader indexReader = IndexReader.open(directorioIndiceSimple);

        String field = props.getProperty("campo_titulo");
        if (logger.isDebugEnabled())
            logger.debug("Creating a dictionary for the field = " + field);
        Dictionary dictionary = new LuceneDictionary(indexReader, field);

        if (logger.isDebugEnabled())
            logger.debug("Creating the spellchecker[" + directorioIndiceSpell + "]");
        SpellChecker spellChecker = new SpellChecker(directorioIndiceSpell);

        if (logger.isDebugEnabled())
            logger.debug("Indexing the dictionary from [" + directorioIndiceSimple + "] into the spell index ["
                    + directorioIndiceSpell + "]");
        spellChecker.indexDictionary(dictionary);

        field = props.getProperty("campo_descripcion");
        if (logger.isDebugEnabled())
            logger.debug("Creating a dictionary for the field = " + field);
        dictionary = new LuceneDictionary(indexReader, field);
        spellChecker.indexDictionary(dictionary);

        indexReader.close();
        directorioIndiceSpell.close();
    } else {
        logger.error("The index does not exist in the directory[" + directorioIndiceSimple + "]");
        throw new Exception("The index does not exist in the directory = " + directorioIndiceSimple);
    }
}
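Once the spell index is built as above, it would typically be queried with SpellChecker.suggestSimilar. A minimal lookup sketch (the directory path and the misspelled input word are placeholders):

// Minimal lookup sketch: open the spell index built above and ask for
// suggestions. The path and the input word are placeholders.
Directory spellDir = FSDirectory.open(new File("/path/to/spellIndex"));
SpellChecker checker = new SpellChecker(spellDir);
String[] suggestions = checker.suggestSimilar("documnet", 5); // up to 5 candidates
for (String s : suggestions) {
    System.out.println(s);
}
checker.close();
spellDir.close();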
From source file:es.unizar.iaaa.crawler.butler.index.SearchFiles.java
License:Apache License
/**
 * Simple command-line based search demo.
 */
public ArrayList<SearchResult> search(String dir, String queryS) throws Exception {
    String index = dir + "index";
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new EnglishAnalyzer();

    // For each information need, preprocess the text and run a query.
    String line = queryS;
    line = line.trim();

    // Strip punctuation as pre-filtering.
    line = line.replace(".", " ");
    line = line.replace(",", " ");
    line = line.replace("(", " ");
    line = line.replace(")", " ");
    line = line.replace(";", " ");
    line = line.replace(":", " ");
    line = line.replace("-", " ");

    // Remove stop words and expand the query semantically.
    Query queryMultiple = null;
    BooleanQuery query = new BooleanQuery(); // value from the query box

    // Generic query over all the indexed fields.
    if (!line.equals("")) {
        String[] fields = { "content" }; // something about this should appear in the title
        BooleanClause.Occur[] flags = { BooleanClause.Occur.SHOULD };
        queryMultiple = MultiFieldQueryParser.parse(line, fields, flags, analyzer);
        query.add(queryMultiple, BooleanClause.Occur.SHOULD);
    }

    ArrayList<SearchResult> result = doPagingSearch(dir, searcher, query);
    reader.close();
    return result;
}
From source file:eu.eexcess.federatedrecommender.decomposer.PseudoRelevanceSourcesDecomposer.java
License:Open Source License
@Override
public SecureUserProfile decompose(SecureUserProfileEvaluation inputSecureUserProfile) {
    FederatedRecommenderCore fCore = null;
    try {
        fCore = FederatedRecommenderCore.getInstance(null);
    } catch (FederatedRecommenderException e) {
        logger.log(Level.SEVERE, "Error getting FederatedRecommenderCore; it was perhaps not initialized correctly", e);
    }

    Set<String> keywords = new HashSet<String>();
    for (ContextKeyword cKeyword : inputSecureUserProfile.contextKeywords) {
        keywords.add(cKeyword.text);
    }

    // Temporarily swap in the query-expansion source partners for this query.
    List<PartnerBadge> tmpPartnerList = new ArrayList<PartnerBadge>();
    for (PartnerBadge partnerBadge : inputSecureUserProfile.partnerList) {
        tmpPartnerList.add(partnerBadge);
    }
    inputSecureUserProfile.partnerList = inputSecureUserProfile.queryExpansionSourcePartner;
    PartnersFederatedRecommendations pFR = fCore.getPartnersRecommendations(inputSecureUserProfile);
    inputSecureUserProfile.partnerList = tmpPartnerList;

    Directory directory = new RAMDirectory();
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_48);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
    IndexWriter writer = null;
    try {
        // Index the title and description of every partner result.
        writer = new IndexWriter(directory, config);
        for (ResultList resultLists : pFR.getResults().values()) {
            for (Result result : resultLists.results) {
                addDoc(writer, result.description);
                addDoc(writer, result.title);
            }
        }
        writer.close();

        IndexReader reader = DirectoryReader.open(directory);
        TermStats[] tStats = null;
        try {
            tStats = HighFreqTerms.getHighFreqTerms(reader, 20, "content", new DocFreqComparator());
        } catch (Exception e) {
            logger.log(Level.SEVERE, "Could not open HighFreqTerms", e);
        } finally {
            reader.close();
        }

        // Add the high-frequency terms as expansion keywords.
        if (tStats != null) {
            for (TermStats termStats : tStats) {
                String utf8String = termStats.termtext.utf8ToString();
                if (utf8String.length() > 4)
                    if (!checkHighFreqTermsQuery(utf8String.toLowerCase(), keywords))
                        if (keywords.add(utf8String.toLowerCase())) {
                            inputSecureUserProfile.contextKeywords.add(new ContextKeyword(utf8String,
                                    termStats.docFreq / 100.0, ExpansionType.EXPANSION));
                        }
            }
        } else
            logger.log(Level.SEVERE, "TermStats was null!");
    } catch (IOException e) {
        logger.log(Level.SEVERE, "There was an error writing/reading the index", e);
    }

    logger.log(Level.INFO, "Source Expansion: " + keywords.toString() + " Partners: "
            + inputSecureUserProfile.queryExpansionSourcePartner);
    return inputSecureUserProfile;
}
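The addDoc helper used above is not part of the listing. A plausible minimal version, assuming it simply indexes the given text into the "content" field that HighFreqTerms later inspects:

// Hypothetical helper, inferred from the calls above: index one string as a
// document with a single "content" field.
private static void addDoc(IndexWriter writer, String text) throws IOException {
    if (text == null) {
        return;
    }
    Document doc = new Document();
    doc.add(new TextField("content", text, Field.Store.YES));
    writer.addDocument(doc);
}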
From source file:eu.eexcess.federatedrecommender.evaluation.schloett.SchloettQueryExtraction.java
License:Open Source License
private static List<Interest> getKeyWordsFromHistoryLinks(
        HashMap<String, LinkedHashMap<String, Object>> hashMap, Object taskId) {
    Directory dir = new RAMDirectory();
    Analyzer analyzer = new StandardAnalyzer();
    IndexWriter writer = null;

    if (hashMap != null)
        for (String keyset : hashMap.keySet()) {
            LinkedHashMap<String, Object> linkedHashMap = hashMap.get(keyset);
            if (linkedHashMap != null && linkedHashMap.get("task_id").equals(taskId)) {
                Object urlObject = linkedHashMap.get("url");
                if (urlObject == null || urlObject.toString().contains("http://de.wikipedia.org/wiki"))
                    continue;

                // Check whether this URL has already been indexed.
                IndexReader reader = null;
                try {
                    reader = DirectoryReader.open(dir);
                } catch (IOException e4) {
                    // The index may not exist yet; ignore.
                }
                IndexSearcher searcher = null;
                if (reader != null)
                    searcher = new IndexSearcher(reader);
                TopDocs docs = null;
                if (searcher != null) {
                    try {
                        docs = searcher.search(new TermQuery(new Term("url", urlObject.toString())), 1);
                    } catch (IOException e4) {
                        e4.printStackTrace();
                    }
                }
                // Done checking for duplicates; close the reader.
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (IOException e3) {
                        e3.printStackTrace();
                    }
                }
                if (docs != null && docs.totalHits > 0)
                    continue; // URL already indexed

                URL url = null;
                try {
                    url = new URL(urlObject.toString());
                } catch (MalformedURLException e3) {
                    e3.printStackTrace();
                }

                InputStream input = null;
                if (url != null) {
                    try {
                        input = url.openStream();
                    } catch (IOException e2) {
                        // Ignore unreachable URLs.
                    }
                    if (input != null) {
                        // Parse the page with Tika.
                        LinkContentHandler linkHandler = new LinkContentHandler();
                        BodyContentHandler textHandler = new BodyContentHandler(10 * 1024 * 1024);
                        ToHTMLContentHandler toHTMLHandler = new ToHTMLContentHandler();
                        TeeContentHandler teeHandler = new TeeContentHandler(linkHandler, textHandler, toHTMLHandler);
                        Metadata metadata = new Metadata();
                        ParseContext parseContext = new ParseContext();
                        HtmlParser parser = new HtmlParser();
                        try {
                            parser.parse(input, teeHandler, metadata, parseContext);
                        } catch (IOException | SAXException | TikaException e1) {
                            System.out.println(urlObject.toString());
                            e1.printStackTrace();
                        }

                        // Keep only the nouns (NN/NNS) that are not blacklisted.
                        String string = textHandler.toString();
                        String docString = " ";
                        String tagged = tagger.tagString(string.toLowerCase());
                        Pattern pattern = Pattern.compile("\\s\\w+(_NN|_NNS)");
                        Matcher matcher = pattern.matcher(tagged);
                        while (matcher.find()) {
                            if (!blackList.contains(matcher.group().replaceAll("_NN|_NNS", "")))
                                docString += matcher.group().replaceAll("_NN|_NNS", " ") + " ";
                        }

                        // Index the extracted nouns together with the URL.
                        Document doc = new Document();
                        doc.add(new TextField("content", docString, Store.YES));
                        doc.add(new StringField("url", urlObject.toString(), Store.YES));
                        try {
                            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_0, analyzer);
                            writer = new IndexWriter(dir, config);
                            writer.addDocument(doc);
                            writer.close();
                            input.close();
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                    }
                }
            }
        }

    // Extract the 30 most frequent terms from everything indexed above.
    IndexReader reader = null;
    try {
        reader = DirectoryReader.open(dir);
    } catch (Exception e1) {
        System.out.println(e1);
    }
    TermStats[] tStats = null;
    if (reader != null)
        try {
            tStats = HighFreqTerms.getHighFreqTerms(reader, 30, "content", new DocFreqComparator());
        } catch (Exception e) {
            System.out.println(e);
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    List<Interest> keywordList = new ArrayList<Interest>();
    System.out.println("Extraction: ");
    if (tStats != null) {
        for (TermStats termStats : tStats) {
            String utf8String = termStats.termtext.utf8ToString();
            if (!blackList.contains(utf8String.toLowerCase())) {
                System.out.print("\"" + utf8String.toLowerCase() + "\",");
                keywordList.add(new Interest(utf8String.toLowerCase()));
            }
        }
    }
    System.out.println();
    return keywordList;
}
From source file:eu.eexcess.sourceselection.redde.dbsampling.DBSampler.java
License:Apache License
/**
 * Estimates the size of the general database using sample-resample and the
 * search term "term".
 *
 * @param term one-term search term for the general and sampled index
 * @return the estimated database size of the general index
 * @throws ParseException
 * @throws IOException
 * @throws IllegalArgumentException
 *             if an index (base or sampled) contains no documents
 */
private double resample(String term) throws ParseException, IOException, IllegalArgumentException {
    Query query = new QueryParser(Settings.IndexFields.IndexTextField, new EnglishAnalyzer()).parse(term);
    outIndexWriter.commit();

    if (inIndexReader.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [inIndexReader]");
    } else if (outIndexWriter.numDocs() <= 0) {
        throw new IllegalArgumentException("failed to resample using empty index [outIndexWriter]");
    }

    double estimation = 0;
    IndexReader sampleIndexReader = null;
    try {
        // Get the total hits for the term in the sample index.
        sampleIndexReader = DirectoryReader.open(outIndexWriter, true);
        IndexSearcher sampleIndexSearcher = new IndexSearcher(sampleIndexReader);
        TopDocs sampleSearchDocs = sampleIndexSearcher.search(query, sampleIndexReader.numDocs());

        // Get the total hits for the term in the general index.
        IndexSearcher generalIndexSearcher = new IndexSearcher(inIndexReader);
        TopDocs generalSearchDocs = generalIndexSearcher.search(query, inIndexReader.numDocs());

        estimation = estimationCalculator(generalSearchDocs.totalHits, sampleSearchDocs.totalHits,
                sampleIndexReader.numDocs(), true);
    } finally {
        if (sampleIndexReader != null) {
            sampleIndexReader.close();
        }
    }
    return estimation;
}
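The estimationCalculator helper is not shown here. Under the sample-resample method, the full database size is estimated by scaling the sample size by the ratio of the term's document frequencies in the two indexes. A hypothetical sketch (the signature is taken from the call above; the formula is the standard sample-resample estimator, not necessarily this project's exact implementation):

// Hypothetical sketch of estimationCalculator, based on the standard
// sample-resample estimator: df_sample / N_sample ~ df_full / N_full,
// so N_full ~ df_full * N_sample / df_sample.
private double estimationCalculator(int generalHits, int sampleHits, int sampleSize, boolean verbose) {
    if (sampleHits <= 0) {
        return 0.0; // the term never occurs in the sample; no estimate possible
    }
    double estimate = ((double) generalHits * sampleSize) / sampleHits;
    if (verbose) {
        System.out.println("estimated database size: " + estimate);
    }
    return estimate;
}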
From source file:eyeskyhigh.lucene.demo.DeleteFiles.java
License:Apache License
/** Deletes documents that contain the given term from an index. */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.DeleteFiles <unique_term>";
    if (args.length == 0) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }
    try {
        Directory directory = FSDirectory.getDirectory("index");
        IndexReader reader = IndexReader.open(directory);

        Term term = new Term("path", args[0]);
        int deleted = reader.deleteDocuments(term);
        System.out.println("deleted " + deleted + " documents containing " + term);

        // Documents can also be deleted by their internal id:
        /*
        for (int i = 0; i < reader.maxDoc(); i++) {
            System.out.println("Deleting document with id " + i);
            reader.delete(i);
        }
        */

        reader.close();
        directory.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}