Example usage for org.apache.lucene.benchmark.byTask.utils StreamUtils inputStream

List of usage examples for org.apache.lucene.benchmark.byTask.utils StreamUtils inputStream

Introduction

This page collects usage examples for the inputStream method of org.apache.lucene.benchmark.byTask.utils.StreamUtils.

Prototype

public static InputStream inputStream(Path file) throws IOException 

Source Link

Document

Returns an InputStream over the requested file.

Usage

From source file:com.evi.knowledge.SimpleWikipediaSource.java

License:Apache License

/**
 * Opens the input stream for {@code file}, announcing the target on standard output first.
 *
 * @return the stream obtained from {@link StreamUtils#inputStream}
 * @throws IOException if the stream cannot be opened
 */
protected InputStream openInputStream() throws IOException {
    final String announcement = "Opening input stream to: " + file;
    System.out.println(announcement);
    final InputStream stream = StreamUtils.inputStream(file);
    return stream;
}

From source file:com.tamingtext.qa.WexWikiContentSource.java

License:Apache License

/**
 * Wraps the stream that {@link StreamUtils#inputStream} opens for {@code file} in a
 * buffered reader that decodes the bytes as UTF-8.
 *
 * @param file the file to read
 * @return a buffered UTF-8 reader over the file's contents
 * @throws IOException if the stream cannot be opened
 */
public BufferedReader getReader(File file) throws IOException {
    return new BufferedReader(new InputStreamReader(StreamUtils.inputStream(file), "UTF-8"));
}

From source file:info.boytsov.lucene.MapAOLQueries.java

License:Open Source License

/**
 * Command-line entry point. Reads a tab-separated query log, maps every query word to its
 * position in a frequency dictionary built from a Lucene index, and writes a random sample
 * of the mapped queries to the destination file, one query per line.
 *
 * <p>Arguments: {@code args[0]} is the index directory, {@code args[1]} the query log
 * (opened through {@link StreamUtils#inputStream}), {@code args[2]} the output file.
 * {@code args[3]} and {@code args[4]} are optional and, when they do not start with
 * {@code -}, override the minimum term frequency and the maximum number of terms.
 * Remaining arguments are options: {@code -permit_sess_duppl}, {@code -debug},
 * {@code -sample_qty <n>} and {@code -min_query_size <n>}.
 *
 * <p>On any error the message and stack trace are printed and the JVM exits with status 1.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    if (args.length < 3) {
        printUsage();
        System.exit(1);
    }

    boolean DEBUG = false;

    String srcDirName = args[0];
    String srcFileName = args[1];
    String dstFileName = args[2];

    // using the same default value, so that we get consistent results
    int minTermFreq = DumpIndex.MIN_TERM_FREQ;
    int maxTermQty = DumpIndex.MAX_TERM_QTY;
    int optArg = 3;
    int minQuerySize = 1;

    // Optional positional arguments: anything starting with '-' is treated as an option instead.
    if (args.length >= 4 && !args[3].startsWith("-")) {
        minTermFreq = Integer.parseInt(args[3]);
        optArg++;

        if (args.length >= 5 && !args[4].startsWith("-")) {
            maxTermQty = Integer.parseInt(args[4]);
            optArg++;
        }
    }

    int sampleQty = -1;
    boolean ignoreSessionDuplicates = true;

    for (int i = optArg; i < args.length; ++i) {
        // Options that take a value are accepted only when the value is present; otherwise
        // they fall through to the "Wrong param" branch instead of crashing on args[++i].
        boolean hasValue = i + 1 < args.length;

        if (args[i].equals("-permit_sess_duppl")) {
            // Permitting duplicates means NOT ignoring them (the flag used to be a no-op).
            ignoreSessionDuplicates = false;
        } else if (args[i].equals("-debug")) {
            DEBUG = true;
        } else if (hasValue && args[i].equals("-sample_qty")) {
            sampleQty = Integer.parseInt(args[++i]);
        } else if (hasValue && args[i].equals("-min_query_size")) {
            minQuerySize = Integer.parseInt(args[++i]);
        } else {
            System.err.println("Wrong param: " + args[i]);
            printUsage();
            System.exit(1);
        }
    }

    System.out
            .println("Source dir: " + srcDirName + " log file: " + srcFileName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);
    System.out.println("SampleQty:" + sampleQty);
    System.out.println("Ignore duplicates within a session: " + ignoreSessionDuplicates);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        try {
            FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);

            if (DEBUG) {
                // Dump the dictionary: one "<sequence number>:<term text>" line per entry.
                Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();

                int termId = 0;

                while (iter.hasNext()) {
                    Entry<TermDesc, Integer> e = iter.next();

                    TermDesc ts = e.getKey();
                    System.out.println(termId + ":" + ts.getText());
                    ++termId;
                }
            }

            ArrayList<String> unparsedQueries = readQueries(srcFileName, ignoreSessionDuplicates);

            CharArraySet stopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

            int totalQty = unparsedQueries.size();

            if (sampleQty == -1)
                sampleQty = totalQty;

            sampleQty = Math.min(sampleQty, totalQty);

            int unusedQty = totalQty;
            BitSet usedQueries = new BitSet(unusedQty);

            outWriter = new FileOutputStream(new File(dstFileName));

            try {
                // Stop as soon as the remaining candidates cannot satisfy the remaining demand.
                while (sampleQty > 0 && unusedQty > 0 && sampleQty <= unusedQty) {
                    // Pick the qr-th (0-based) not-yet-used query, uniformly at random.
                    int qr = (int) Math.floor(Math.random() * unusedQty);
                    int qn = usedQueries.nextClearBit(0);

                    while (qr-- > 0) {
                        qn = usedQueries.nextClearBit(qn + 1);
                    }

                    // Sanity check: the chosen index must be a valid, unused query.
                    if (qn >= totalQty || usedQueries.get(qn)) {
                        throw new Exception("Bug: selected an already used query!");
                    }

                    String res = mapQueryToTermPositions(unparsedQueries.get(qn), dict, stopWords, DEBUG,
                            minQuerySize);

                    if (!res.isEmpty()) {
                        outWriter.write(res.getBytes());
                        outWriter.write('\n');
                        sampleQty--;
                    }

                    usedQueries.set(qn);
                    unusedQty--;
                }
            } finally {
                outWriter.close();
            }

            if (sampleQty > 0) {
                throw new Exception("Failed to obtained a required number of queries, " + sampleQty
                        + " are not found." + " Please, ask for fewer queries to be converted.");
            }
        } finally {
            reader.close();
        }

    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }

}

/**
 * Reads the query column (second tab-separated field) of every line in the query log.
 * The header line (first field equal to {@code AnonID}) is skipped, as are lines that have
 * no second field. When {@code ignoreSessionDuplicates} is set, a query is kept only once
 * per run of consecutive lines sharing the same first field (the session id).
 */
private static ArrayList<String> readQueries(String srcFileName, boolean ignoreSessionDuplicates)
        throws Exception {
    File srcFile = new File(srcFileName);

    // supports either gzip, bzip2, or regular text file,
    // detects type by extension
    InputStream inputStream = StreamUtils.inputStream(srcFile);

    BufferedReader logReader = new BufferedReader(new InputStreamReader(inputStream),
            StreamUtils.BUFFER_SIZE);

    ArrayList<String> unparsedQueries = new ArrayList<String>();

    try {
        String line;
        String prevSessID = "";

        HashSet<String> sessQueries = new HashSet<String>();

        while ((line = logReader.readLine()) != null) {
            String parts[] = line.split("\t");

            String sessID = parts[0];
            // Ignore the header line
            if (sessID.equals("AnonID"))
                continue;

            // A line without a query column (e.g. a blank line) carries nothing to convert.
            if (parts.length < 2)
                continue;

            String q = parts[1];

            if (!sessID.equals(prevSessID)) {
                prevSessID = sessID;
                sessQueries.clear();
            }

            if (ignoreSessionDuplicates) {
                // add() returns false when the query was already seen in this session.
                if (!sessQueries.add(q))
                    continue;
            }

            unparsedQueries.add(q);
        }
    } finally {
        logReader.close();
    }

    return unparsedQueries;
}

/**
 * Converts one query into a space-separated list of dictionary positions (prefixed with
 * {@code word:} in debug mode). Stop words are dropped. Returns an empty string when any
 * remaining word is missing from the dictionary or when fewer than {@code minQuerySize}
 * non-stop words were seen.
 */
private static String mapQueryToTermPositions(String q, FreqWordDict dict, CharArraySet stopWords,
        boolean debug, int minQuerySize) {
    String res = "";
    int querySize = 0;

    for (String s : q.split("\\s+")) {
        // We need to ignore stop words, but not the original queries
        if (stopWords.contains(s)) {
            continue;
        }
        ++querySize;

        Integer pos = dict.getTermPos(s);
        if (pos == null) {
            // An unknown word invalidates the whole query.
            return "";
        }
        String posStr = debug ? s + ":" + pos : pos.toString();
        res = res.isEmpty() ? posStr : res + " " + posStr;
    }

    return querySize >= minQuerySize ? res : "";
}

From source file:info.boytsov.lucene.parsers.ClueWeb09ContentSource.java

License:Open Source License

/**
 * Closes the current input and opens the next file from {@code inputFiles} as a
 * {@link DataInputStream} stored in {@code reader}.
 *
 * <p>When the list is exhausted, reading wraps around to the first file and
 * {@code iteration} is incremented, unless {@code forever} is false. A file that cannot
 * be opened is reported and skipped in verbose mode; otherwise it ends the data with a
 * {@code NoMoreDataException} whose cause is the original failure.
 *
 * @throws NoMoreDataException if all files were consumed and {@code forever} is false,
 *         or if a file could not be opened while not in verbose mode
 * @throws IOException declared for callers; failures to open a file are handled as above
 */
void openNextFile() throws NoMoreDataException, IOException {
    close();

    while (true) {
        if (nextFile >= inputFiles.size()) {
            // exhausted files, start a new round, unless forever set to false.
            if (!forever) {
                throw new NoMoreDataException();
            }
            nextFile = 0;
            iteration++;
        }
        File f = inputFiles.get(nextFile++);
        if (verbose) {
            System.out.println("opening: " + f + " length: " + f.length());
        }
        try {
            // supports gzip, bzip2, or regular text file, extension is used to detect
            InputStream inputStream = StreamUtils.inputStream(f);
            reader = new DataInputStream(inputStream);
            return;
        } catch (Exception e) {
            if (verbose) {
                System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " due to " + e.getMessage());
                continue;
            }
            // Keep the underlying failure attached so it is not silently lost.
            NoMoreDataException failure = new NoMoreDataException();
            failure.initCause(e);
            throw failure;
        }
    }
}

From source file:info.boytsov.lucene.parsers.EnwikiContentSource.java

License:Apache License

/** Open the input stream. */
protected InputStream openInputStream() throws IOException {
    return StreamUtils.inputStream(file);
}

From source file:info.boytsov.lucene.parsers.TrecContentSource.java

License:Apache License

/**
 * Closes the current input and opens the next file from {@code inputFiles} as a buffered
 * reader (decoded with {@code encoding}) stored in {@code reader}, recording the file's
 * path type in {@code currPathType}.
 *
 * <p>When the list is exhausted, reading wraps around to the first file and
 * {@code iteration} is incremented, unless {@code forever} is false. A file that cannot
 * be opened is reported and skipped in verbose mode; otherwise it ends the data with a
 * {@code NoMoreDataException} whose cause is the original failure.
 *
 * @throws NoMoreDataException if all files were consumed and {@code forever} is false,
 *         or if a file could not be opened while not in verbose mode
 * @throws IOException declared for callers; failures to open a file are handled as above
 */
void openNextFile() throws NoMoreDataException, IOException {
    close();
    currPathType = null;
    while (true) {
        if (nextFile >= inputFiles.size()) {
            // exhausted files, start a new round, unless forever set to false.
            if (!forever) {
                throw new NoMoreDataException();
            }
            nextFile = 0;
            iteration++;
        }
        File f = inputFiles.get(nextFile++);
        if (verbose) {
            System.out.println("opening: " + f + " length: " + f.length());
        }
        try {
            // support either gzip, bzip2, or regular text file, by extension
            InputStream inputStream = StreamUtils.inputStream(f);
            reader = new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE);
            currPathType = TrecDocParser.pathType(f);
            return;
        } catch (Exception e) {
            if (verbose) {
                System.out.println("Skipping 'bad' file " + f.getAbsolutePath() + " due to " + e.getMessage());
                continue;
            }
            // Keep the underlying failure attached so it is not silently lost.
            NoMoreDataException failure = new NoMoreDataException();
            failure.initCause(e);
            throw failure;
        }
    }
}

From source file:parsers.ClueWebContentSource.java

License:Open Source License

/**
 * Closes the current input and opens the next path from {@code inputFiles} as a
 * {@link DataInputStream} stored in {@code reader}.
 *
 * <p>When the list is exhausted, reading wraps around to the first file and
 * {@code iteration} is incremented, unless {@code forever} is false. A file that cannot
 * be opened is reported and skipped in verbose mode; otherwise it ends the data with a
 * {@code NoMoreDataException} whose cause is the original failure.
 *
 * @throws NoMoreDataException if all files were consumed and {@code forever} is false,
 *         or if a file could not be opened while not in verbose mode
 * @throws IOException declared for callers; failures to open a file are handled as above
 */
void openNextFile() throws NoMoreDataException, IOException {
    close();

    while (true) {
        if (nextFile >= inputFiles.size()) {
            // exhausted files, start a new round, unless forever set to false.
            if (!forever) {
                throw new NoMoreDataException();
            }
            nextFile = 0;
            iteration++;
        }
        Path f = inputFiles.get(nextFile++);
        if (verbose) {
            System.out.println("opening: " + f + " length: " + f.toFile().length());
        }
        try {
            // supports gzip, bzip2, or regular text file, extension is used to detect
            InputStream inputStream = StreamUtils.inputStream(f);
            reader = new DataInputStream(inputStream);
            return;
        } catch (Exception e) {
            if (verbose) {
                System.out.println(
                        "Skipping 'bad' file " + f.toFile().getAbsolutePath() + " due to " + e.getMessage());
                continue;
            }
            // Keep the underlying failure attached so it is not silently lost.
            NoMoreDataException failure = new NoMoreDataException();
            failure.initCause(e);
            throw failure;
        }
    }
}

From source file:parsers.EnwikiContentSource.java

License:Apache License

/** Open the input stream. */
protected InputStream openInputStream() throws IOException {
    return StreamUtils.inputStream(file.toPath());
}

From source file:parsers.TrecContentSource.java

License:Apache License

/**
 * Closes the current input and opens the next path from {@code inputFiles} as a buffered
 * reader (decoded with {@code encoding}) stored in {@code reader}, recording the file's
 * path type in {@code currPathType}.
 *
 * <p>When the list is exhausted, reading wraps around to the first file and
 * {@code iteration} is incremented, unless {@code forever} is false. A file that cannot
 * be opened is reported and skipped in verbose mode; otherwise it ends the data with a
 * {@code NoMoreDataException} whose cause is the original failure.
 *
 * @throws NoMoreDataException if all files were consumed and {@code forever} is false,
 *         or if a file could not be opened while not in verbose mode
 * @throws IOException declared for callers; failures to open a file are handled as above
 */
void openNextFile() throws NoMoreDataException, IOException {
    close();
    currPathType = null;
    while (true) {
        if (nextFile >= inputFiles.size()) {
            // exhausted files, start a new round, unless forever set to false.
            if (!forever) {
                throw new NoMoreDataException();
            }
            nextFile = 0;
            iteration++;
        }
        Path f = inputFiles.get(nextFile++);
        if (verbose) {
            System.out.println("opening: " + f + " length: " + f.toFile().length());
        }
        try {
            // support either gzip, bzip2, or regular text file, by extension
            InputStream inputStream = StreamUtils.inputStream(f);
            reader = new BufferedReader(new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE);
            currPathType = TrecDocParser.pathType(f.toFile());
            return;
        } catch (Exception e) {
            if (verbose) {
                System.out.println(
                        "Skipping 'bad' file " + f.toFile().getAbsolutePath() + " due to " + e.getMessage());
                continue;
            }
            // Keep the underlying failure attached so it is not silently lost.
            NoMoreDataException failure = new NoMoreDataException();
            failure.initCause(e);
            throw failure;
        }
    }
}