Example usage for org.apache.hadoop.util LineReader readLine

Introduction

This page collects example usages of org.apache.hadoop.util.LineReader.readLine.

Prototype

public int readLine(Text str) throws IOException 

Document

Read from the InputStream into the given Text.
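
Before the full listings, here is a minimal sketch of the usual readLine loop: the method fills the supplied Text with the next line and returns the number of bytes it consumed, so a return value of 0 is treated as end of stream. The file path below is a placeholder for illustration only, not taken from any of the examples.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

public class LineReaderSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Placeholder input path; replace with a real HDFS or local file.
        Path input = new Path("/tmp/example.txt");
        FileSystem fs = input.getFileSystem(conf);

        FSDataInputStream in = fs.open(input);
        LineReader reader = new LineReader(in, conf);
        try {
            Text line = new Text();
            // readLine overwrites 'line' on each pass and returns the bytes consumed;
            // 0 means the end of the stream has been reached.
            while (reader.readLine(line) > 0) {
                System.out.println(line.toString());
            }
        } finally {
            reader.close(); // closing the reader also closes the wrapped stream
        }
    }
}

The usage entries below apply this same pattern to different tasks: writing an index file, computing input splits, loading URL filter lists, and parsing metadata.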

Usage

From source file: WikipediaForwardIndexBuilder.java

License: Apache License

@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION));
    String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION);

    String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000);

    if (!inputPath.isAbsolute()) {
        System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class);
    FileSystem fs = FileSystem.get(conf);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - language: " + language);
    LOG.info("Note: This tool only works on block-compressed SequenceFiles!");

    conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language));

    conf.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(tmpPath));
    FileOutputFormat.setCompressOutput(conf, false);

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(NoSplitSequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int blocks = (int) counters.getCounter(Blocks.Total);

    LOG.info("number of blocks: " + blocks);

    LOG.info("Writing index file...");
    LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000")));
    FSDataOutputStream out = fs.create(new Path(indexFile), true);

    out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName());
    out.writeUTF(inputPath.toString());
    out.writeInt(blocks);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\s+");

        int docno = Integer.parseInt(arr[0]);
        int offset = Integer.parseInt(arr[1]);
        short fileno = Short.parseShort(arr[2]);

        out.writeInt(docno);
        out.writeInt(offset);
        out.writeShort(fileno);

        cnt++;

        if (cnt % 100000 == 0) {
            LOG.info(cnt + " blocks written");
        }
    }

    reader.close();
    out.close();

    if (cnt != blocks) {
        throw new RuntimeException("Error: mismatch in block count!");
    }

    // Clean up.
    fs.delete(new Path(tmpPath), true);

    return 0;
}

From source file: be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License: Apache License

/**
 * Gets the different file splits for the data based on a given number of splits
 *
 * @param status
 *          file status
 * @param conf
 *          hadoop configuration object
 * @param numberOfSplits
 *          number of splits to split the data in
 * @return list of file splits
 * @throws IOException
 *           thrown if the file does not exist
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits)
        throws IOException {
    List<FileSplit> splits = newArrayList();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    long totalNumberOfLines = getTotalNumberOfLines(conf, fileName);
    int numLinesPerSplit = (int) Math.ceil(1.0 * totalNumberOfLines / numberOfSplits);
    LineReader lr = null;
    FSDataInputStream in = null;
    try {
        in = fileName.getFileSystem(conf).open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                splits.add(createFileSplit(fileName, begin, length));
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(createFileSplit(fileName, begin, length));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
        if (in != null) {
            in.close();
        }
    }
    return splits;
}

From source file: be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License: Apache License

/**
 * Gets the total number of lines from the file. If Config.NUMBER_OF_LINES_KEY is set, this value is returned.
 *
 * @param conf
 *          hadoop configuration object
 * @param fileName
 *          name of file to count
 * @return the number of lines in the file
 * @throws IOException
 */
public static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    try {
        FSDataInputStream in = fileName.getFileSystem(conf).open(fileName);
        LineReader lr = new LineReader(in, conf);
        Text text = new Text();
        nrLines = 0;
        while (lr.readLine(text) > 0) {
            nrLines++;
        }
        in.close();
        return nrLines;
    } catch (IOException e) {
        e.printStackTrace();
    }
    return 0;
}

From source file: bixo.examples.crawl.MultiDomainUrlFilter.java

License: Apache License

public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try {//process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) {//skip blank lines
                    String p = tmpStr.toString().trim();//remove whitespace
                    if (p.substring(0, 1).equals("+")) {// '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add((Boolean) true);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) {// '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(new Boolean(false));
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
            }
        }

    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}

From source file: bixo.examples.crawl.RegexUrlFilter.java

License: Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}

From source file: boostingPL.boosting.InstancesHelper.java

License: Open Source License

/**
 * create instances header from metadata,
 * the metadata looks like this:
 * 
 *   <br/>
 *   <p>attributesNum:100</p>
 *   <p>classes:+1,-1</p>
 *   <br/>
 * 
 * @param in
 * @return
 * @throws IOException
 */
public static Instances createInstancesFromMetadata(LineReader in) throws IOException {
    int attributesNum = 0;
    ArrayList<Attribute> attInfo = null;
    List<String> classItems = null;

    Text line = new Text();
    while (in.readLine(line) > 0) {
        String sline = line.toString();
        if (sline.startsWith("attributesNum:")) {
            attributesNum = Integer.parseInt(sline.substring(14));
            attInfo = new ArrayList<Attribute>(attributesNum + 1);
            for (int i = 0; i < attributesNum; i++) {
                attInfo.add(new Attribute("attr" + i));
            }

            System.out.println("AttributeNum:" + attributesNum);
        } else if (sline.startsWith("classes:")) {
            String classes = sline.substring(8);
            String[] citems = classes.split(",");
            classItems = new ArrayList<String>(citems.length);
            for (String s : citems) {
                classItems.add(s);
            }

            System.out.println("classes:" + classes);
        }
    }

    attInfo.add(new Attribute("class", classItems));
    Instances insts = new Instances("BoostingPL-dataset", attInfo, 0);
    insts.setClassIndex(insts.numAttributes() - 1);

    return insts;
}

From source file: bucket_sort.NLineInputFormat.java

License: Apache License

public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numLinesPerSplit)
        throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit>();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    FileSystem fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
        FSDataInputStream in = fs.open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                // NLineInputFormat uses LineRecordReader, which always reads
                // (and consumes) at least one character out of its upper split
                // boundary. So to make sure that each mapper gets N lines, we
                // move back the upper split limits of each split 
                // by one character here.
                if (begin == 0) {
                    splits.add(new FileSplit(fileName, begin, length - 1, new String[] {}));
                } else {
                    splits.add(new FileSplit(fileName, begin - 1, length, new String[] {}));
                }
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(new FileSplit(fileName, begin, length, new String[] {}));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
    }
    return splits;
}

From source file: com.finderbots.miner.RegexUrlFilter.java

License: Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    }
    return filterList;
}

From source file: com.knewton.mrtool.io.JsonRecordReaderTest.java

License: Apache License

/**
 * Tests the line reader in the record reader to see if records can be read correctly from the
 * beginning of an input stream.
 * 
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testJsonRecordReader() throws IOException, InterruptedException {
    JsonRecordReader<Text> rr = new JsonRecordReader<Text>() {
        @Override
        protected Class<?> getDataClass(String jsonStr) {
            return Text.class;
        }
    };

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    FileSplit fileSplit = new FileSplit(new Path("recs.2013-03-20_02_52.log"), 0, recommendationBytes.length,
            new String[0]);

    new MockUp<FileSystem>() {
        @Mock
        public FSDataInputStream open(Path f) throws IOException {
            return new FSDataInputStream(new SeekableByteArrayInputStream(recommendationBytes));
        }
    };
    // Initialize it to get the compression codecs
    rr.initialize(fileSplit, context);
    // close the line reader and reopen it.
    rr.close();
    LineReader lineReader = rr.initLineReader(fileSplit, conf);
    Text line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[0], line.toString());

    line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[1], line.toString());
    lineReader.close();
}

From source file: com.knewton.mrtool.io.JsonRecordReaderTest.java

License: Apache License

/**
 * Tests the line reader in the record reader to see if records can be read correctly from a
 * random seek location in the input stream.
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void testJsonRecordReaderWithRandomPos() throws IOException, InterruptedException {
    JsonRecordReader<Text> rr = new JsonRecordReader<Text>() {
        @Override
        protected Class<?> getDataClass(String jsonStr) {
            return Text.class;
        }
    };

    Configuration conf = new Configuration();
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    FileSplit fileSplit = new FileSplit(new Path("recs.2013-03-20_02_52.log"), 10, recommendationBytes.length,
            new String[0]);

    new MockUp<FileSystem>() {
        @Mock
        public FSDataInputStream open(Path f) throws IOException {
            return new FSDataInputStream(new SeekableByteArrayInputStream(recommendationBytes));
        }
    };
    // Initialize it to get the compression codecs
    rr.initialize(fileSplit, context);
    // close the line reader and reopen it.
    rr.close();
    LineReader lineReader = rr.initLineReader(fileSplit, conf);
    Text line = new Text();
    lineReader.readLine(line);
    assertEquals(DummyJsonRecommendations.jsonRecommendations[1], line.toString());
    line = new Text();
    lineReader.readLine(line);
    assertTrue(line.toString().isEmpty());
    lineReader.close();
}