List of usage examples for org.apache.lucene.benchmark.byTask.utils StreamUtils BUFFER_SIZE
int BUFFER_SIZE
To view the source code for org.apache.lucene.benchmark.byTask.utils.StreamUtils.BUFFER_SIZE, click the Source link.
From source file:info.boytsov.lucene.MapAOLQueries.java
License:Open Source License
public static void main(String[] args) { if (args.length < 3) { printUsage();/* www . ja v a 2s.c om*/ System.exit(1); } boolean DEBUG = false; String srcDirName = args[0]; String srcFileName = args[1]; String dstFileName = args[2]; // using the same default value, so that we get consistent results int minTermFreq = DumpIndex.MIN_TERM_FREQ; int maxTermQty = DumpIndex.MAX_TERM_QTY; int optArg = 3; int minQuerySize = 1; if (args.length >= 4 && !args[3].startsWith("-")) { minTermFreq = Integer.parseInt(args[3]); optArg++; // using the same default value, so that we get consistent results if (args.length >= 5 && !args[4].startsWith("-")) { maxTermQty = Integer.parseInt(args[4]); optArg++; } } int sampleQty = -1; boolean ignoreSessionDuplicates = true; for (int i = optArg; i < args.length; ++i) { if (args[i].equals("-permit_sess_duppl")) { ignoreSessionDuplicates = true; } else if (args[i].equals("-debug")) { DEBUG = true; } else if (args[i].equals("-sample_qty")) { sampleQty = Integer.parseInt(args[++i]); } else if (args[i].equals("-min_query_size")) { minQuerySize = Integer.parseInt(args[++i]); } else { System.err.println("Wrong param: " + args[i]); printUsage(); System.exit(1); } } System.out .println("Source dir: " + srcDirName + " log file: " + srcFileName + " target dir: " + dstFileName); System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty); System.out.println("SampleQty:" + sampleQty); System.out.println("Ignore duplicates within a session: " + ignoreSessionDuplicates); try { IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName))); FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty); if (DEBUG) { Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator(); int termId = 0; while (iter.hasNext()) { Entry<TermDesc, Integer> e = iter.next(); TermDesc ts = e.getKey(); System.out.println(termId + ":" + ts.getText()); ++termId; } } File srcFile = new File(srcFileName); // 
supports either gzip, bzip2, or regular text file, // detects type by extension InputStream inputStream = StreamUtils.inputStream(srcFile); BufferedReader logReader = new BufferedReader(new InputStreamReader(inputStream), StreamUtils.BUFFER_SIZE); String line; String prevSessID = ""; HashSet<String> sessQueries = new HashSet<String>(); ArrayList<String> unparsedQueries = new ArrayList<String>(); CharArraySet stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET; while ((line = logReader.readLine()) != null) { String parts[] = line.split("\t"); String sessID = parts[0]; // Ignore the header line if (sessID.equals("AnonID")) continue; String q = parts[1]; if (!sessID.equals(prevSessID)) { prevSessID = sessID; sessQueries.clear(); } if (ignoreSessionDuplicates && sessQueries.contains(q)) continue; if (ignoreSessionDuplicates) sessQueries.add(q); unparsedQueries.add(q); } logReader.close(); if (sampleQty == -1) sampleQty = unparsedQueries.size(); sampleQty = Math.min(sampleQty, unparsedQueries.size()); int unusedQty = unparsedQueries.size(); BitSet usedQueries = new BitSet(unusedQty); outWriter = new FileOutputStream(new File(dstFileName)); while (sampleQty > 0 && unusedQty > 0 && sampleQty <= unusedQty) { int qr = (int) Math.floor(Math.random() * unusedQty); int qn = 0; for (int i = 0; i < usedQueries.cardinality(); ++i) { if (usedQueries.get(i)) { ++qn; continue; } if (--qr < 0) { qn = i; break; } } if (usedQueries.get(qn)) { throw new Exception("Bug: selected an already used query!"); } String q = unparsedQueries.get(qn); String queryParts[] = q.split("\\s+"); String res = ""; int querySize = 0; for (String s : queryParts) { // We need to ignore stop words, but not the original queries if (stopWords.contains(s)) { continue; } else { ++querySize; } Integer pos = dict.getTermPos(s); if (pos == null) { res = ""; break; } String posStr = DEBUG ? s + ":" + pos : pos.toString(); res = res.isEmpty() ? 
posStr : res + " " + posStr; } if (!res.isEmpty() && querySize >= minQuerySize) { outWriter.write(res.getBytes()); outWriter.write('\n'); sampleQty--; } usedQueries.set(qn); unusedQty--; } if (sampleQty > 0) { throw new Exception("Failed to obtained a required number of queries, " + sampleQty + " are not found." + " Please, ask for fewer queries to be converted."); } } catch (Exception e) { System.err.println("Error: " + e.getMessage()); e.printStackTrace(); System.exit(1); } }
From source file:info.boytsov.lucene.parsers.TrecContentSource.java
License:Apache License
void openNextFile() throws NoMoreDataException, IOException { close();/* ww w. j a v a 2 s . c o m*/ currPathType = null; while (true) { if (nextFile >= inputFiles.size()) { // exhausted files, start a new round, unless forever set to false. if (!forever) { throw new NoMoreDataException(); } nextFile = 0; iteration++; } File f = inputFiles.get(nextFile++); if (verbose) { System.out.println("opening: " + f + " length: " + f.length()); } try { InputStream inputStream = StreamUtils.inputStream(f); // support either gzip, bzip2, or regular text file, by extension reader = new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE); currPathType = TrecDocParser.pathType(f); return; } catch (Exception e) { if (verbose) { System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " due to " + e.getMessage()); continue; } throw new NoMoreDataException(); } } }
From source file:parsers.TrecContentSource.java
License:Apache License
void openNextFile() throws NoMoreDataException, IOException { close();// w ww . j av a 2 s. co m currPathType = null; while (true) { if (nextFile >= inputFiles.size()) { // exhausted files, start a new round, unless forever set to false. if (!forever) { throw new NoMoreDataException(); } nextFile = 0; iteration++; } Path f = inputFiles.get(nextFile++); if (verbose) { System.out.println("opening: " + f + " length: " + f.toFile().length()); } try { InputStream inputStream = StreamUtils.inputStream(f); // support either gzip, bzip2, or regular text file, by extension reader = new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE); currPathType = TrecDocParser.pathType(f.toFile()); return; } catch (Exception e) { if (verbose) { System.out.println( "Skipping 'bad' file " + f.toFile().getAbsolutePath() + " due to " + e.getMessage()); continue; } throw new NoMoreDataException(); } } }