Example usage for org.apache.hadoop.util LineReader LineReader

Introduction

On this page you can find example usages of the org.apache.hadoop.util.LineReader constructor LineReader(InputStream in).

Prototype

public LineReader(InputStream in) 

Document

Create a line reader that reads from the given stream using the default buffer-size (64k).
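
For reference, here is a minimal sketch of typical usage. It reads a local file; the file name and surrounding class are illustrative assumptions, not taken from the examples below.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class LineReaderExample {
    public static void main(String[] args) throws IOException {
        // Any InputStream works: a local file here, typically FileSystem.open(path) on HDFS.
        try (InputStream in = new FileInputStream("input.txt")) {
            LineReader reader = new LineReader(in); // default 64k buffer
            Text line = new Text();
            // readLine() returns the number of bytes consumed; 0 signals end of stream.
            while (reader.readLine(line) > 0) {
                System.out.println(line.toString());
            }
            reader.close();
        }
    }
}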

Usage

From source file: WikipediaForwardIndexBuilder.java

License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));

    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;

        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }

    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    // Clean up.
    fs.delete(new Path(tmpPath), true);

    return 0;
}

From source file: bixo.examples.crawl.MultiDomainUrlFilter.java

License: Apache License

public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try {//process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) {//skip blank lines
                    String p = tmpStr.toString().trim();//remove whitespace
                    if (p.substring(0, 1).equals("+")) {// '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) {// '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(Boolean.FALSE);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
            }
        }

    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}

From source file: bixo.examples.crawl.RegexUrlFilter.java

License: Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}

From source file: boostingPL.MR.AdaBoostPLMapper.java

License: Open Source License

/** create instances header */
protected void setup(Context context) throws IOException, InterruptedException {
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();
}

From source file: boostingPL.MR.AdaBoostPLTestMapper.java

License: Open Source License

protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }
    instanceCounter = context.getCounter("BoostingPL", "Number of instances");
}

From source file: boostingPL.MR.AdaBoostPLTestReducer.java

License: Open Source License

protected void setup(Context context) throws IOException, InterruptedException {
    // classifier file
    Path path = new Path(context.getConfiguration().get("BoostingPL.modelPath") + "/part-r-00000");
    String boostingName = context.getConfiguration().get("BoostingPL.boostingName");
    boostingPL = BoostingPLFactory.createBoostingPL(boostingName, context.getConfiguration(), path);

    // testing dataset metadata
    String pathSrc = context.getConfiguration().get("BoostingPL.metadata");
    FileSystem hdfs = FileSystem.get(context.getConfiguration());
    FSDataInputStream dis = new FSDataInputStream(hdfs.open(new Path(pathSrc)));
    LineReader in = new LineReader(dis);
    insts = InstancesHelper.createInstancesFromMetadata(in);
    in.close();
    dis.close();

    try {
        eval = new Evaluation(insts);
    } catch (Exception e) {
        LOG.error("[BoostingPL-Test]: Evaluation init error!");
        e.printStackTrace();
    }
}

From source file: brush.FastqRecordReader.java

License: Apache License

/**
 * Builds a new record reader given a config file and an input split.
 *
 * @param conf The Hadoop configuration object. Used for gaining access
 *   to the underlying file system.
 * @param split The file split to read.
 */
protected FastqRecordReader(final Configuration conf, final FileSplit split) throws IOException {
    file = split.getPath();
    start = split.getStart();
    end = start + split.getLength();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(file);

    CompressionCodecFactory codecFactory = new CompressionCodecFactory(conf);
    CompressionCodec codec = codecFactory.getCodec(file);

    if (codec == null) { // no codec.  Uncompressed file.
        positionAtFirstRecord(fileIn);
        inputStream = fileIn;
    } else {
        // compressed file
        if (start != 0) {
            throw new RuntimeException("Start position for compressed file is not 0! (found " + start + ")");
        }

        inputStream = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE; // read until the end of the file
    }

    lineReader = new LineReader(inputStream);
}

From source file: brush.FastqRecordReader.java

License: Apache License

/**
 * Position the input stream at the start of the first record.
 *
 * @param stream The stream to reposition.
 */
protected void positionAtFirstRecord(FSDataInputStream stream) throws IOException {
    Text buffer = new Text();

    if (true) { // always reposition; change to (start > 0) to assume files begin with a valid record
        // Advance to the start of the first record that ends with /1
        // We use a temporary LineReader to read lines until we find the
        // position of the right one.  We then seek the file to that position.
        stream.seek(start);
        LineReader reader = new LineReader(stream);

        int bytesRead = 0;
        do {
            bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
            int bufferLength = buffer.getLength();
            if (bytesRead > 0 && !checkBuffer(bufferLength, buffer)) {
                start += bytesRead;
            } else {
                // line starts with @.  Read two more and verify that it starts with a +
                //
                // If this isn't the start of a record, we want to backtrack to its end
                long backtrackPosition = start + bytesRead;

                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                bytesRead = reader.readLine(buffer, (int) Math.min(MAX_LINE_LENGTH, end - start));
                if (bytesRead > 0 && buffer.getLength() > 0 && buffer.getBytes()[0] == '+') {
                    break; // all good!
                } else {
                    // backtrack to the end of the record we thought was the start.
                    start = backtrackPosition;
                    stream.seek(start);
                    reader = new LineReader(stream);
                }
            }
        } while (bytesRead > 0);

        stream.seek(start);
    }

    pos = start;
}

From source file: com.finderbots.miner.RegexUrlFilter.java

License: Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    // this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}

From source file: com.redsqirl.workflow.server.connect.HDFSInterface.java

License: Open Source License

/**
 * Read rows from the provided path.
 * 
 * @param path
 * @param delimiter
 * @param maxToRead
 * @throws RemoteException
 */
@Override
public List<String> select(String path, String delimiter, int maxToRead) throws RemoteException {
    Path p = new Path(path);
    List<String> ans = null;
    HdfsFileChecker fCh = new HdfsFileChecker(p);
    try {
        FileSystem fs = NameNodeVar.getFS();
        if (fCh.isDirectory()) {
            FileStatus[] fsA = fs.listStatus(p);
            int listSize = Math.min(maxToRead, fsA.length);
            ans = new ArrayList<String>(listSize);
            for (int i = 0; i < listSize; ++i) {
                ans.add(fsA[i].getPath().toString());
            }
        } else if (fCh.isFile()) {
            InputStream inS = fs.open(p);
            InputStream in = null;
            InputStream compressedReader = null;
            if (path.endsWith(".bz2") || path.endsWith(".bz")) {
                compressedReader = new BZip2CompressorInputStream(inS);
                in = compressedReader;
            } else if (path.endsWith(".gz")) {
                compressedReader = new GZIPInputStream(inS);
                in = compressedReader;
            } else {
                in = inS;
            }

            LineReader reader = new LineReader(in);
            ans = new ArrayList<String>(maxToRead);
            Text line = new Text();
            int lineNb = 0;
            while (reader.readLine(line) != 0 && lineNb < maxToRead) {
                ans.add(line.toString());
                ++lineNb;
            }
            if (compressedReader != null) {
                compressedReader.close();
            }
            inS.close();
        }
        // fs.close();
    } catch (IOException e) {
        logger.error("Cannot select the file or directory: " + p);
        logger.error(e.getMessage(), e);
    }
    // fCh.close();

    return ans;
}