List of usage examples for org.apache.lucene.search IndexSearcher doc
public Document doc(int docID) throws IOException
Sugar for .getIndexReader().document(docID)
From source file:com.redsqirl.SimpleSearcher.java
License:Open Source License
/**
 * Runs {@code queryStr} against the {@code contents} field of the Lucene index
 * stored in {@code indexDir} and collects the stored {@code filename} value of
 * each hit.
 *
 * @param indexDir directory holding the Lucene index
 * @param queryStr query text, parsed with a {@code QueryParser} using {@code SimpleAnalyzer}
 * @param maxHits  maximum number of hits to request from the searcher
 * @return the {@code filename} field of each returned hit, in hit order
 * @throws Exception if the index cannot be opened or read, or the query cannot be parsed
 */
public List<String> searchIndex(File indexDir, String queryStr, int maxHits) throws Exception {
    Directory directory = FSDirectory.open(indexDir);
    try {
        IndexSearcher searcher = new IndexSearcher(directory);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", new SimpleAnalyzer());
            Query query = parser.parse(queryStr);
            TopDocs topDocs = searcher.search(query, maxHits);
            ScoreDoc[] hits = topDocs.scoreDocs;

            List<String> list = new ArrayList<String>(hits.length);
            for (ScoreDoc hit : hits) {
                String filename = searcher.doc(hit.doc).get("filename");
                logger.info(filename);
                list.add(filename);
            }
            logger.info("Found " + hits.length);
            return list;
        } finally {
            // Release the searcher even when parsing or searching fails.
            searcher.close();
        }
    } finally {
        // The directory was opened here, so it is closed here as well.
        directory.close();
    }
}
From source file:com.ricky.codelab.lucene.LuceneIndexAndSearchDemo.java
License:Apache License
/** * /*from w ww . ja v a2 s . c o m*/ * ??? * @param args */ public static void main(String[] args) { //Lucene Document?? String fieldName = "text"; // String text = "IK Analyzer???????"; //IKAnalyzer? Analyzer analyzer = new IKAnalyzer(true); Directory directory = null; IndexWriter iwriter = null; IndexReader ireader = null; IndexSearcher isearcher = null; try { // directory = new RAMDirectory(); //?IndexWriterConfig IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer); iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); iwriter = new IndexWriter(directory, iwConfig); // Document doc = new Document(); doc.add(new StringField("ID", "10000", Field.Store.YES)); doc.add(new TextField(fieldName, text, Field.Store.YES)); iwriter.addDocument(doc); iwriter.close(); //?********************************** //? ireader = DirectoryReader.open(directory); isearcher = new IndexSearcher(ireader); String keyword = "?"; //QueryParser?Query QueryParser qp = new QueryParser(fieldName, analyzer); qp.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = qp.parse(keyword); System.out.println("Query = " + query); //?5? TopDocs topDocs = isearcher.search(query, 5); System.out.println("" + topDocs.totalHits); // ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (int i = 0; i < topDocs.totalHits; i++) { Document targetDoc = isearcher.doc(scoreDocs[i].doc); System.out.println("" + targetDoc.toString()); } } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { e.printStackTrace(); } } if (directory != null) { try { directory.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:com.rubenlaguna.en4j.searchlucene.NoteFinderLuceneImpl.java
License:Open Source License
public Collection<Note> find(String searchText) { if ("".equals(searchText.trim())) { return Collections.EMPTY_LIST; }//from w w w . j a v a 2 s . c o m long start = System.currentTimeMillis(); searchText = searchText.trim(); String patternStr = "\\s+"; String replaceStr = "* "; Pattern pattern = Pattern.compile(patternStr); Matcher matcher = pattern.matcher(searchText); searchText = matcher.replaceAll(replaceStr); if (Pattern.matches(".*\\w$", searchText)) { searchText = searchText + "*"; } LOG.info("search text:" + searchText); final Collection<Note> toReturn = new ArrayList<Note>(); try { IndexReader newReader = reader.reopen(); if (newReader != reader) { reader.close(); } reader = newReader; LOG.info("using index version: " + reader.getVersion()); final IndexSearcher searcher = new IndexSearcher(reader); final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_29); QueryParser parser = new CustomQueryParser("all", analyzer); parser.setDefaultOperator(QueryParser.Operator.AND); Query query = parser.parse(searchText); LOG.info("query =" + query.toString()); //search the query Collector collector = new Collector() { private int docBase = 0; @Override public void setScorer(Scorer scorer) throws IOException { } @Override public void collect(int doc) throws IOException { int scoreId = doc + docBase; Document document = searcher.doc(scoreId); final String stringValue = document.getField("id").stringValue(); int docId = Integer.parseInt(stringValue); LOG.fine("doc id " + stringValue + " matches the search."); toReturn.add(nr.get(docId, false)); } @Override public void setNextReader(IndexReader reader, int docBase) throws IOException { this.docBase = docBase; } @Override public boolean acceptsDocsOutOfOrder() { return true; } }; searcher.search(query, collector); searcher.close(); } catch (ParseException ex) { Exceptions.printStackTrace(ex); } catch (CorruptIndexException ex) { Exceptions.printStackTrace(ex); } catch (IOException ex) { 
Exceptions.printStackTrace(ex); } catch (IllegalStateException ex) { LOG.info("caught " + ex.getMessage() + ". Most likely the app is shutting down"); } long delta = System.currentTimeMillis() - start; Installer.mbean.sampleSearchTime(delta); LOG.info("find took " + delta / 1000.0 + " secs. " + toReturn.size() + " results found"); return toReturn; }
From source file:com.searchcode.app.service.CodeSearcher.java
License:Open Source License
/**
 * Only used as fallback if getByRepoFileName fails for some reason due to what
 * appears to be a lucene index bug. This should always work as the path used is
 * sha1 and should be unique for anything the current codebase can deal with.
 *
 * <p>Looks up the single best hit for the given code id, reads the referenced
 * file from disk and copies the stored fields into a {@link CodeResult}.
 *
 * @param codeId value of the {@code Values.CODEID} field to look up
 * @return the populated result, or {@code null} when nothing matched or the lookup failed
 */
public CodeResult getByCodeId(String codeId) {
    CodeResult codeResult = null;

    // try-with-resources closes the reader even when parsing or searching throws.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH)))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);

        String queryString = Values.CODEID + ":" + QueryParser.escape(codeId);
        Query query = parser.parse(queryString);
        Singleton.getLogger().info("Query to get by " + queryString);

        TopDocs results = searcher.search(query, 1);
        ScoreDoc[] hits = results.scoreDocs;

        if (hits.length != 0) {
            Document doc = searcher.doc(hits[0].doc);
            String filepath = doc.get(Values.PATH);

            // Best effort: an unreadable file yields a result with no code lines.
            List<String> code = new ArrayList<>();
            try {
                code = Singleton.getHelpers().readFileLinesGuessEncoding(filepath,
                        Singleton.getHelpers().tryParseInt(
                                Properties.getProperties().getProperty(Values.MAXFILELINEDEPTH,
                                        Values.DEFAULTMAXFILELINEDEPTH),
                                Values.DEFAULTMAXFILELINEDEPTH));
            } catch (Exception ex) {
                Singleton.getLogger().info("Indexed file appears to binary: " + filepath);
            }

            codeResult = new CodeResult(code, null);
            codeResult.setFilePath(filepath);
            codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME));
            codeResult.setFileName(doc.get(Values.FILENAME));
            codeResult.setLanguageName(doc.get(Values.LANGUAGENAME));
            codeResult.setMd5hash(doc.get(Values.MD5HASH));
            codeResult.setCodeLines(doc.get(Values.CODELINES));
            codeResult.setDocumentId(hits[0].doc);
            codeResult.setRepoName(doc.get(Values.REPONAME));
            codeResult.setRepoLocation(doc.get(Values.REPOLOCATION));
            codeResult.setCodeOwner(doc.get(Values.CODEOWNER));
            codeResult.setCodeId(doc.get(Values.CODEID));
        }
    } catch (Exception ex) {
        LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage());
    }

    return codeResult;
}
From source file:com.searchcode.app.service.CodeSearcher.java
License:Open Source License
/**
 * Builds summary statistics for one repository: total code lines, number of
 * indexed files, lines per language (sorted descending) and the language and
 * owner facets.
 *
 * <p>Documents whose language is rejected by
 * {@code searchcodeLib.languageCostIgnore} are left out of the line totals.
 * Any failure is logged and the values gathered so far are returned.
 *
 * @param repoName repository name matched against the {@code Values.REPONAME} field
 * @return the statistics; zero counts and empty lists when the lookup fails early
 */
public ProjectStats getProjectStats(String repoName) {
    int totalCodeLines = 0;
    int totalFiles = 0;
    List<CodeFacetLanguage> codeFacetLanguages = new ArrayList<>();
    List<CodeFacetOwner> repoFacetOwners = new ArrayList<>();
    List<CodeFacetLanguage> codeByLines = new ArrayList<>();
    SearchcodeLib searchcodeLib = Singleton.getSearchCodeLib();

    // try-with-resources closes the reader even when the search or facet calls throw.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH)))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new CodeAnalyzer();
        QueryParser parser = new QueryParser(CODEFIELD, analyzer);
        Query query = parser.parse(Values.REPONAME + ":" + repoName);

        TopDocs results = searcher.search(query, Integer.MAX_VALUE);
        ScoreDoc[] hits = results.scoreDocs;

        Map<String, Integer> linesCount = new HashMap<>();

        // Bound by the hits actually returned, not by the reported total.
        for (ScoreDoc hit : hits) {
            Document doc = searcher.doc(hit.doc);

            if (!searchcodeLib.languageCostIgnore(doc.get(Values.LANGUAGENAME))) {
                int lines = Singleton.getHelpers().tryParseInt(doc.get(Values.CODELINES), "0");
                totalCodeLines += lines;
                String languageName = doc.get(Values.LANGUAGENAME).replace("_", " ");
                linesCount.merge(languageName, lines, Integer::sum);
            }
        }

        for (Map.Entry<String, Integer> entry : linesCount.entrySet()) {
            codeByLines.add(new CodeFacetLanguage(entry.getKey(), entry.getValue()));
        }
        // Descending by count. Integer.compare avoids the overflow of b - a.
        codeByLines.sort((a, b) -> Integer.compare(b.getCount(), a.getCount()));

        totalFiles = results.totalHits;
        codeFacetLanguages = this.getLanguageFacetResults(searcher, reader, query);
        repoFacetOwners = this.getOwnerFacetResults(searcher, reader, query);
    } catch (Exception ex) {
        LOGGER.severe("CodeSearcher getProjectStats caught a " + ex.getClass() + "\n with message: "
                + ex.getMessage());
    }

    return new ProjectStats(totalCodeLines, totalFiles, codeFacetLanguages, codeByLines, repoFacetOwners);
}
From source file:com.searchcode.app.service.CodeSearcher.java
License:Open Source License
/** * Due to very large repositories (500,000 files) this needs to support * paging. Also need to consider the fact that is a list of strings * TODO maybe convert to hash so lookups are faster *///www.j a v a2 s.co m public List<String> getRepoDocuments(String repoName, int page) { int REPOPAGELIMIT = 1000; List<String> fileLocations = new ArrayList<>(REPOPAGELIMIT); int start = REPOPAGELIMIT * page; try { IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new CodeAnalyzer(); QueryParser parser = new QueryParser(CODEFIELD, analyzer); Query query = parser.parse(Values.REPONAME + ":" + repoName); TopDocs results = searcher.search(query, Integer.MAX_VALUE); int end = Math.min(results.totalHits, (REPOPAGELIMIT * (page + 1))); ScoreDoc[] hits = results.scoreDocs; for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); fileLocations.add(doc.get(Values.PATH)); } reader.close(); } catch (Exception ex) { LOGGER.severe("CodeSearcher getRepoDocuments caught a " + ex.getClass() + " on page " + page + "\n with message: " + ex.getMessage()); } return fileLocations; }
From source file:com.searchcode.app.service.CodeSearcher.java
License:Open Source License
/** * Only really used internally but does the heavy lifting of actually converting the index document on disk to the * format used internally including reading the file from disk. *///from w w w. j av a 2s . co m public SearchResult doPagingSearch(IndexReader reader, IndexSearcher searcher, Query query, int page) throws IOException { TopDocs results = searcher.search(query, 20 * this.PAGELIMIT); // 20 pages worth of documents ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; int start = this.PAGELIMIT * page; int end = Math.min(numTotalHits, (this.PAGELIMIT * (page + 1))); int noPages = numTotalHits / this.PAGELIMIT; if (noPages > 20) { noPages = 19; } List<Integer> pages = this.calculatePages(numTotalHits, noPages); List<CodeResult> codeResults = new ArrayList<>(); for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); String filepath = doc.get(Values.PATH); if (filepath != null) { // This line is occasionally useful for debugging ranking, but not useful enough to have as log info //System.out.println("doc=" + hits[i].doc + " score=" + hits[i].score); List<String> code = new ArrayList<>(); try { // This should probably be limited by however deep we are meant to look into the file // or the value we use here whichever is less code = Singleton.getHelpers() .readFileLinesGuessEncoding(filepath, Singleton.getHelpers().tryParseInt( Properties.getProperties().getProperty(Values.MAXFILELINEDEPTH, Values.DEFAULTMAXFILELINEDEPTH), Values.DEFAULTMAXFILELINEDEPTH)); } catch (Exception ex) { LOGGER.warning("Indexed file appears to binary or missing: " + filepath); } CodeResult cr = new CodeResult(code, null); cr.setCodePath(doc.get(Values.FILELOCATIONFILENAME)); cr.setFileName(doc.get(Values.FILENAME)); cr.setLanguageName(doc.get(Values.LANGUAGENAME)); cr.setMd5hash(doc.get(Values.MD5HASH)); cr.setCodeLines(doc.get(Values.CODELINES)); cr.setDocumentId(hits[i].doc); cr.setRepoLocation(doc.get(Values.REPOLOCATION)); 
cr.setRepoName(doc.get(Values.REPONAME)); cr.setCodeOwner(doc.get(Values.CODEOWNER)); cr.setCodeId(doc.get(Values.CODEID)); codeResults.add(cr); } else { LOGGER.warning((i + 1) + ". " + "No path for this document"); } } List<CodeFacetLanguage> codeFacetLanguages = this.getLanguageFacetResults(searcher, reader, query); List<CodeFacetRepo> repoFacetLanguages = this.getRepoFacetResults(searcher, reader, query); List<CodeFacetOwner> repoFacetOwner = this.getOwnerFacetResults(searcher, reader, query); return new SearchResult(numTotalHits, page, query.toString(), codeResults, pages, codeFacetLanguages, repoFacetLanguages, repoFacetOwner); }
From source file:com.searchcode.app.service.TimeCodeSearcher.java
/** * Attempts to find a unique file given the repository name and the path/filename however * it seems to randomly not find things for some files. No idea of the root cause at this point and have implemented * a work around where we get the file by getById which is no ideal. The bug appears to be due to some issue * inside lucene itself as using raw queries to pull back the file results in no matches, and yet it does appear * when not limiting to the repo//from w w w .j av a 2 s.com * TODO investigate the lucene issue that occurs here mentioned above * TODO needs to use the revision number here as well to get the right value */ public CodeResult getByRepoFileName(String repo, String fileName) { CodeResult codeResult = null; try { IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new CodeAnalyzer(); QueryParser parser = new QueryParser(CODEFIELD, analyzer); // TODO I have a feeling this may not be unique if there are to files in the same directory with different case... 
something to investigate Query query = parser .parse(Values.FILELOCATIONFILENAME + ":" + QueryParser.escape(repo + "/" + fileName)); Singleton.getLogger().info("Query to get by filename = " + Values.FILELOCATIONFILENAME + ":" + QueryParser.escape(repo + "/" + fileName)); TopDocs results = searcher.search(query, 1); ScoreDoc[] hits = results.scoreDocs; if (hits.length != 0) { Document doc = searcher.doc(hits[0].doc); String filepath = doc.get(Values.PATH); List<String> code = new ArrayList<>(); try { code = Files.readAllLines(Paths.get(filepath), StandardCharsets.UTF_8); code = Singleton.getHelpers() .readFileLines(filepath, Singleton.getHelpers().tryParseInt( Properties.getProperties().getProperty(Values.MAXFILELINEDEPTH, Values.DEFAULTMAXFILELINEDEPTH), Values.DEFAULTMAXFILELINEDEPTH)); } catch (Exception ex) { Singleton.getLogger().info("Indexed file appears to binary: " + filepath); } codeResult = new CodeResult(code, null); codeResult.setCodePath(doc.get(Values.FILELOCATIONFILENAME)); codeResult.setFileName(doc.get(Values.FILENAME)); codeResult.setLanguageName(doc.get(Values.LANGUAGENAME)); codeResult.setMd5hash(doc.get(Values.MD5HASH)); codeResult.setCodeLines(doc.get(Values.CODELINES)); codeResult.setDocumentId(hits[0].doc); codeResult.setRepoName(doc.get(Values.REPONAME)); codeResult.setRepoLocation(doc.get(Values.REPOLOCATION)); codeResult.setCodeOwner(doc.get(Values.CODEOWNER)); } reader.close(); } catch (Exception ex) { LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage()); } return codeResult; }
From source file:com.searchcode.app.service.TimeCodeSearcher.java
public List<String> getRepoDocuments(String repoName) { List<String> fileLocations = new ArrayList<>(); try {//from ww w . ja v a 2 s . c o m IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(this.INDEXPATH))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new CodeAnalyzer(); QueryParser parser = new QueryParser(CODEFIELD, analyzer); Query query = parser.parse(Values.REPONAME + ":" + repoName); TopDocs results = searcher.search(query, Integer.MAX_VALUE); ScoreDoc[] hits = results.scoreDocs; for (int i = 0; i < hits.length; i++) { Document doc = searcher.doc(hits[i].doc); fileLocations.add(doc.get(Values.FILELOCATIONFILENAME)); } reader.close(); } catch (Exception ex) { LOGGER.severe(" caught a " + ex.getClass() + "\n with message: " + ex.getMessage()); } return fileLocations; }
From source file:com.searchcode.app.service.TimeCodeSearcher.java
/** * Only really used internally but does the heavy lifting of actually converting the index document on disk to the * format used internally including reading the file from disk. *//* w w w . ja va 2 s. co m*/ public SearchResult doPagingSearch(IndexReader reader, IndexSearcher searcher, Query query, int page) throws IOException { TopDocs results = searcher.search(query, 20 * this.PAGELIMIT); // 20 pages worth of documents ScoreDoc[] hits = results.scoreDocs; int numTotalHits = results.totalHits; int start = this.PAGELIMIT * page; int end = Math.min(numTotalHits, (this.PAGELIMIT * (page + 1))); int noPages = numTotalHits / this.PAGELIMIT; if (noPages > 20) { noPages = 20; } List<Integer> pages = new ArrayList<>(); for (int i = 0; i < noPages; i++) { pages.add(i); } List<CodeResult> codeResults = new ArrayList<>(); for (int i = start; i < end; i++) { Document doc = searcher.doc(hits[i].doc); String filepath = doc.get(Values.PATH); if (filepath != null) { // This line is occasionally useful for debugging ranking, but not useful enough to have as log info //System.out.println("doc=" + hits[i].doc + " score=" + hits[i].score); CodeResult cr = new CodeResult(null, null); cr.setCodePath(doc.get(Values.FILELOCATIONFILENAME)); cr.setFileName(doc.get(Values.FILENAME)); cr.setLanguageName(doc.get(Values.LANGUAGENAME)); cr.setMd5hash(doc.get(Values.MD5HASH)); cr.setCodeLines(doc.get(Values.CODELINES)); cr.setDocumentId(hits[i].doc); cr.setRepoLocation(doc.get(Values.REPOLOCATION)); cr.setRepoName(doc.get(Values.REPONAME)); cr.setCodeOwner(doc.get(Values.CODEOWNER)); cr.setRevision(doc.get(Values.REVISION)); cr.setYearMonthDay(doc.get(Values.DATEYEARMONTHDAY)); cr.setMessage(doc.get(Values.MESSAGE)); cr.setDeleted(doc.get(Values.DELETED)); try { // This should probably be limited by however deep we are meant to look into the file // or the value we use here whichever is less String repoLoc = "./repo/" + cr.getRepoName() + "/.git"; cr.setCode(Arrays.asList(gitService 
.fetchFileRevision(repoLoc, cr.getRevision(), cr.getCodePath()).split("\\r?\\n"))); } catch (Exception ex) { LOGGER.warning("Indexed file appears to binary or missing: " + filepath); } codeResults.add(cr); } else { LOGGER.warning((i + 1) + ". " + "No path for this document"); } } List<CodeFacetLanguage> codeFacetLanguages = this.getLanguageFacetResults(searcher, reader, query); List<CodeFacetRepo> repoFacetLanguages = this.getRepoFacetResults(searcher, reader, query); List<CodeFacetOwner> repoFacetOwner = this.getOwnerFacetResults(searcher, reader, query); List<CodeFacetYearMonthDay> repoFacetYearMonthDay = this.getYearMonthDayFacetResults(searcher, reader, query); List<CodeFacetYearMonth> repoFacetYearMonth = this.getYearMonthFacetResults(searcher, reader, query); List<CodeFacetYear> repoFacetYear = this.getYearFacetResults(searcher, reader, query); List<CodeFacetRevision> repoFacetRevision = this.getRevisionFacetResults(searcher, reader, query); List<CodeFacetDeleted> repoFacetDeleted = this.getDeletedFacetResults(searcher, reader, query); SearchResult searchResult = new SearchResult(numTotalHits, page, query.toString(), codeResults, pages, codeFacetLanguages, repoFacetLanguages, repoFacetOwner); searchResult.setRepoFacetYearMonthDay(repoFacetYearMonthDay); searchResult.setRepoFacetYearMonth(repoFacetYearMonth); searchResult.setRepoFacetYear(repoFacetYear); searchResult.setRepoFacetRevision(repoFacetRevision); searchResult.setRepoFacetDeleted(repoFacetDeleted); return searchResult; }