List of usage examples for the org.apache.hadoop.util.LineReader constructor
public LineReader(InputStream in)
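Before the per-project examples below, here is a minimal sketch of the typical read loop with this constructor. It assumes a plain local FileInputStream as the byte source; the file name and class name are illustrative only and do not come from any of the projects listed. The constructor wraps any InputStream, and readLine(Text) returns the number of bytes consumed, or 0 at end of stream.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class LineReaderExample {
    public static void main(String[] args) throws IOException {
        // Any InputStream will do; a local file is used here purely for illustration.
        InputStream in = new FileInputStream("urls.txt");
        LineReader reader = new LineReader(in);
        Text line = new Text();
        // readLine(Text) returns the number of bytes consumed, or 0 at end of stream.
        while (reader.readLine(line) > 0) {
            System.out.println(line.toString());
        }
        // Closing the LineReader also closes the wrapped stream.
        reader.close();
    }
}

The same pattern recurs in every example below; the InputStream typically comes from FileSystem.open(Path) on HDFS rather than a local file.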
From source file:WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w ww. ja va 2s. co m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - index file: " + indexFile); LOG.info(" - language: " + language); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already. 
fs.delete(new Path(tmpPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName()); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } // Clean up. fs.delete(new Path(tmpPath), true); return 0; }
From source file:bixo.examples.crawl.MultiDomainUrlFilter.java
License:Apache License
public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try { //process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) { //skip blank lines
                    String p = tmpStr.toString().trim(); //remove whitespace
                    if (p.substring(0, 1).equals("+")) { // '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) { // '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(new Boolean(false));
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
            }
        }
    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}
From source file:bixo.examples.crawl.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}
From source file:boostingPL.MR.AdaBoostPLMapper.java
License:Open Source License
/** create instances header */
protected void setup(Context context) throws IOException, InterruptedException {
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();
}
From source file:boostingPL.MR.AdaBoostPLTestMapper.java
License:Open Source License
protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }

    instanceCounter = context.getCounter("BoostingPL", "Number of instances");
}
From source file:boostingPL.MR.AdaBoostPLTestReducer.java
License:Open Source License
protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }
}
From source file:brush.FastqRecordReader.java
License:Apache License
/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *             to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec. Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else { // compressed file
        if (start != 0) {
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
        }
        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}
From source file:brush.FastqRecordReader.java
License:Apache License
/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // (start > 0) // use start>0 to assume that files start with valid data
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one. We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                start += bytesRead;
            } else {
                // line starts with @. Read two more and verify that it starts with a +
                //
                // If this isn't the start of a record, we want to backtrack to its end
                long backtrackPosition = start + bytesRead;

                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // backtrack to the end of the record we thought was the start.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        stream.seek(start);
    }

    pos = start;
}
From source file:com.finderbots.miner.RegexUrlFilter.java
License:Apache License
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}
From source file:com.redsqirl.workflow.server.connect.HDFSInterface.java
License:Open Source License
/**
 * Read rows from the path provided.
 *
 * @param path
 * @param delimiter
 * @param maxToRead
 * @throws RemoteException
 */
@Override
public List<String> select(String path, String delimiter, int maxToRead) throws RemoteException {
    Path p = new Path(path);
    List<String> ans = null;
    HdfsFileChecker fCh = new HdfsFileChecker(p);
    try {
        FileSystem fs = NameNodeVar.getFS();
        if (fCh.isDirectory()) {
            FileStatus[] fsA = fs.listStatus(p);
            int listSize = Math.min(maxToRead, fsA.length);
            ans = new ArrayList<String>(listSize);
            for (int i = 0; i < listSize; ++i) {
                ans.add(fsA[i].getPath().toString());
            }
        } else if (fCh.isFile()) {
            InputStream inS = fs.open(p);
            InputStream in = null;
            InputStream compressedReader = null;
            if (path.endsWith(".bz2") || path.endsWith(".bz")) {
                compressedReader = new BZip2CompressorInputStream(inS);
                in = compressedReader;
            } else if (path.endsWith(".gz")) {
                compressedReader = new GZIPInputStream(inS);
                in = compressedReader;
            } else {
                in = inS;
            }

            LineReader reader = new LineReader(in);
            ans = new ArrayList<String>(maxToRead);
            Text line = new Text();
            int lineNb = 0;
            while (reader.readLine(line) != 0 && lineNb < maxToRead) {
                ans.add(line.toString());
                ++lineNb;
            }
            if (compressedReader != null) {
                compressedReader.close();
            }
            inS.close();
        }
        // fs.close();
    } catch (IOException e) {
        logger.error("Cannot select the file or directory: " + p);
        logger.error(e.getMessage(), e);
    }
    // fCh.close();
    return ans;
}