Example usage for org.apache.hadoop.mapred TextInputFormat getRecordReader

List of usage examples for org.apache.hadoop.mapred TextInputFormat getRecordReader


On this page you can find example usages of org.apache.hadoop.mapred.TextInputFormat#getRecordReader.


public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter)
            throws IOException 

Source Link


From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java

License:Apache License

/**
 * Uses TextInputFormat.getSplits() to test pulling split info.
 *
 * @throws IOException
 */
// Exercises TextInputFormat.getSplits() and getRecordReader() against a file created
// on the local file system, checking the content of the lines read from each split.
// NOTE(review): this excerpt looks truncated by extraction -- some statements and
// closing braces are absent (flagged inline), so it is not compilable as shown.
public void testGetSplits() throws IOException {

    // NOTE(review): 'input' is never used in the visible code; 'format' below is used instead.
    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    // iteration count of the fill loop below (presumably one line written per iteration)
    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            // NOTE(review): only the string argument survives here; the call that consumes it
            // (presumably writer.write(...)) and the loop's closing brace are missing.
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
    } finally {
        // NOTE(review): the finally body (presumably closing the writer) is missing from the excerpt.

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    // key/value instances are reused across every reader.next() call below
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    // One split is requested but two are expected.
    // NOTE(review): presumably because the file is larger than the local default block size -- confirm.
    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", "
                + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {

                // the first record of each split must contain the expected line prefix
                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

            // NOTE(review): the increment of 'count' and the closing braces of the if/while are
            // missing; judging by indentation the lines below run after the read loop, when
            // 'value' still holds the last record that was read.

            System.out.println("last: " + value.toString());

            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

        } finally {
            // NOTE(review): finally body (presumably reader.close()) is missing from the excerpt.

    } // for each split


From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java

License:Open Source License

/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
// Partitions a text-cell matrix in two steps: (1) read the cells of 'fname' through
// TextInputFormat and append them to a local staging area, (2) write the staged
// partitions to 'fnameNew', optionally with several threads.
// NOTE(review): this excerpt looks truncated by extraction -- some statements and
// closing braces are absent (flagged inline).
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    // last parsed cell position; kept outside the try so the catch block can inspect it
    long row = -1;
    long col = -1;

    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        // key/value/tokenizer are reused for every record to avoid re-allocation
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                // each record is parsed as "<row> <col> <value>", separated by a single space
                while (reader.next(key, value)) {
                    st.reset(value.toString()); //reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);
                    // NOTE(review): the statement adding 'tmp' to 'buffer' is not visible in this excerpt.

                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    // NOTE(review): remainder of the flush branch and the loop's closing brace are missing.

                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
            } finally {
                if (reader != null)
                    // NOTE(review): guarded statement (presumably reader.close()) is missing.

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            // at most _par threads, and never more threads than staged partitions
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                // thread i handles a contiguous chunk of ceil(n/len) partitions; 'end' is
                // inclusive and clamped to the last valid index
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                // NOTE(review): starting the thread is not visible in this excerpt.

            for (Thread t : threads)
                // NOTE(review): loop body (presumably t.join()) is missing.
        } else {
            // sequential fallback: write one staged partition directory after another
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        // NOTE(review): row/col start at -1, so a failure before the first cell is parsed also
        // takes the out-of-range branch, and that branch does not attach 'e' as the cause.
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License:Open Source License

/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
// Merges the text-cell files of all input matrix objects into the single file
// 'fnameNew' by streaming every input line (trimmed) into one writer.
// NOTE(review): this excerpt looks truncated by extraction -- some statements and
// closing braces are absent (flagged inline).
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        // NOTE(review): the delete call and the condition guarding the early return below
        // are not visible in this excerpt.

            copyAllFiles(fnameNew, inMO);
            return; //we're done

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        // second argument 'true' is passed to fs.create so an existing file is overwritten
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
                // NOTE(review): the loop's opening brace is missing; judging by indentation all
                // statements down to the inner split loop belong to this loop.
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                // a separate JobConf per input, so each input path is registered on its own
                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    // NOTE(review): the call below is cut off; its last argument
                    // (presumably Reporter.NULL) is missing.
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                    try {
                        // copy every input line, trimmed, followed by a newline
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                    } finally {
                        if (reader != null)
                            // NOTE(review): guarded statement (presumably reader.close()) is missing.
        } finally {
            if (out != null)
                // NOTE(review): guarded statement (presumably out.close()) is missing.
    } catch (Exception ex) {
        // any failure is reported as a DMLRuntimeException with the original cause attached
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License:Open Source License

/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */

// Reads the text-cell file of matrix object 'mo' through TextInputFormat and appends
// its cells, in buffered batches, to the staging area 'fnameStaging' under id 'ID'.
// NOTE(review): this excerpt looks truncated by extraction -- some statements and
// closing braces are absent (flagged inline).
private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    // key/value are reused for every record
    LongWritable key = new LongWritable();
    Text value = new Text();

    // block dimensions of the matrix, forwarded to every staging-area append
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell),
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)

    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            // each record is parsed as "<row> <col> <value>", separated by a single space
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);
                // NOTE(review): the statement adding 'tmp' to 'buffer' is not visible in this excerpt.

                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                // NOTE(review): remainder of the flush branch and the loop's closing brace are missing.

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
        } finally {
            if (reader != null)
                // NOTE(review): guarded statement (presumably reader.close()) is missing.

From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java

License:Open Source License

/**
 * @param path
 * @param job
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
// Reads a text-cell matrix from 'path' through TextInputFormat and stores every parsed
// cell in 'dest', using appendValue for sparse targets and setValueDenseUnsafe for
// dense ones.
// NOTE(review): this excerpt looks truncated by extraction -- some statements and
// closing braces are absent (flagged inline).
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    // last parsed 0-based cell position; kept outside the try so the catch block can inspect it
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);

            try {
                // the sparse/dense decision is made once per split, outside the record loop
                if (sparse) //SPARSE<-value
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        // indexes in the file are 1-based; shift to 0-based
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    // NOTE(review): the loop's closing brace (and anything following it in the
                    // sparse branch) is not visible in this excerpt.

                } else //DENSE<-value
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        // indexes in the file are 1-based; shift to 0-based
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
            } finally {
                if (reader != null)
                    // NOTE(review): guarded statement (presumably reader.close()) is missing.
    } catch (Exception ex) {
        //post-mortem error handling and bounds checking
        // NOTE(review): row/col start at -1, so a failure before the first cell is parsed also
        // takes the out-of-range branch, and that branch does not attach 'ex' as the cause.
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in text cell format.", ex);

From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSVParallel.java

License:Open Source License

/**
 * @param path
 * @param job
 * @param hasHeader
 * @param delim
 * @return
 * @throws IOException
 * @throws DMLRuntimeException
 */
// Determines the dimensions of a CSV input (columns from one line of the first split,
// rows from per-split CountRowsTask results), records per-split row offsets in
// '_offsets', and returns the output block created for those dimensions.
// NOTE(review): this excerpt looks truncated by extraction -- some statements and
// closing braces are absent (flagged inline).
private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job,
        boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();

    // count no of entities in the first non-header row
    // NOTE(review): the visible code reads the first record of splits[0] without checking
    // 'hasHeader'; any header handling is not visible in this excerpt.
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            // number of columns = number of delimiter occurrences + 1
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
    } finally {
        // NOTE(review): finally body (presumably reader.close()) is missing from the excerpt.

    // count rows in parallel per split
    try {
        ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<CountRowsTask>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            // only the task for the first split is created with the original header flag
            hasHeader = false;
        // NOTE(review): submitting the tasks to 'pool' and shutting it down are not visible here.

        // collect row counts for offset computation
        // early error notify in case not all tasks successful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            // offset of a split = total number of rows counted in all preceding splits
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
    } catch (Exception e) {
        // NOTE(review): this also catches the IOException thrown above for a failed count task
        // and re-wraps it with the "Threadpool Error" prefix.
        throw new IOException("Threadpool Error " + e.getMessage(), e);

    // allocate target matrix block based on given size;
    // need to allocate sparse as well since lock-free insert into target
    return createOutputMatrixBlock(nrow, ncol, estnnz, true, true);

From source file:com.ibm.bi.dml.udf.lib.RemoveEmptyRows.java

License:Open Source License

/**
 * Reads the text-cell file behind function input 0 and assigns every distinct row
 * index found there a new consecutive row id (starting at 1), writing to a new output
 * file and publishing the result in {@code _ret} with {@code keyMap.size()} rows.
 * Any failure is rethrown as a {@link RuntimeException} with the cause attached.
 *
 * NOTE(review): this excerpt looks truncated by extraction -- some statements and
 * closing braces are absent (flagged inline).
 */
public void execute() {
    Matrix mat = (Matrix) this.getFunctionInput(0);
    String fnameOld = mat.getFilePath();

    HashMap<Long, Long> keyMap = new HashMap<Long, Long>(); //old,new rowID

    try {
        //prepare input
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameOld);
        FileSystem fs = FileSystem.get(job);
        if (!fs.exists(path))
            throw new IOException("File " + fnameOld + " does not exist on HDFS.");
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();

        //prepare output
        String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
        DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);

        //read and write if necessary
        InputSplit[] splits = informat.getSplits(job, 1);

        LongWritable key = new LongWritable();
        Text value = new Text();
        // next new row id to hand out; incremented whenever an unseen old row id appears
        long ID = 1;

        try {
            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        String cellStr = value.toString().trim();
                        StringTokenizer st = new StringTokenizer(cellStr, " ");
                        // NOTE(review): indexes are parsed with Integer.parseInt although stored
                        // in long variables, so values beyond the int range fail to parse.
                        long row = Integer.parseInt(st.nextToken());
                        long col = Integer.parseInt(st.nextToken());
                        double lvalue = Double.parseDouble(st.nextToken());

                        // first occurrence of an old row id gets the next free new id
                        if (!keyMap.containsKey(row))
                            keyMap.put(row, ID++);
                        long rowNew = keyMap.get(row);

                        // NOTE(review): only the two separator appends survive; the appends of
                        // rowNew/col/lvalue and the write to 'ostream' are not visible here.
                        sb.append(' ');
                        sb.append(' ');

                } finally {
                    if (reader != null)
                        // NOTE(review): guarded statement (presumably reader.close()) is missing.

            // the result has one row per distinct old row id and the input's column count
            _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
        } finally {
            if (ostream != null)
                // NOTE(review): guarded statement (presumably ostream.close()) is missing.
    } catch (Exception ex) {
        throw new RuntimeException("Unable to execute external function.", ex);

From source file:org.apache.drill.exec.store.text.DrillTextRecordReader.java

License:Apache License

/**
 * Creates a text record reader over a single file split: stores the delimiter and
 * split, registers the projected columns, and opens a Hadoop {@code TextInputFormat}
 * record reader whose buffer size comes from
 * {@code ExecConstants.TEXT_LINE_READER_BUFFER_SIZE}.
 *
 * NOTE(review): this excerpt looks truncated by extraction -- some statements and
 * closing braces are absent (flagged inline).
 */
public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context, char delimiter,
        List<SchemaPath> columns) {
    // the delimiter is narrowed from char to byte
    this.delimiter = (byte) delimiter;
    this.split = split;
    setColumns(columns);

    // explicit projection only: validate each selected column
    if (!isStarQuery()) {
        String pathStr;
        for (SchemaPath path : columns) {
            assert path.getRootSegment().isNamed();
            pathStr = path.getRootSegment().getPath();
            // NOTE(review): the two lines below are the tail of a call whose beginning is
            // missing (presumably an argument check on pathStr).
                            || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
                    "Selected column(s) must have name 'columns' or must be plain '*'");

            if (path.getRootSegment().getChild() != null) {
                // NOTE(review): the line below is the tail of a call whose beginning is missing
                // (presumably a check that the child segment is an array segment).
                        "Selected column must be an array index");
                int index = path.getRootSegment().getChild().getArraySegment().getIndex();
                // NOTE(review): what is done with 'index' (presumably recorded in columnIds) is
                // not visible in this excerpt.
        numCols = columnIds.size();

    TextInputFormat inputFormat = new TextInputFormat();
    JobConf job = new JobConf(fsConf);
    job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
    try {
        reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
        // key/value holders are created once here and stored in fields
        key = reader.createKey();
        value = reader.createValue();
        totalRecordsRead = 0;
    } catch (Exception e) {
        handleAndRaise("Failure in creating record reader", e);

From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java

License:Apache License

/**
 * Runs the first step sequentially: for every input split, in the order returned by
 * {@code getSplits}, a mapper is created and fed all records of that split, with
 * output collected in {@code firstOutput}. The duration of each split and of the
 * slowest split is logged.
 *
 * NOTE(review): this excerpt looks truncated by extraction -- some statements and
 * closing braces are absent (flagged inline).
 */
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    // sorted copy of the splits; a split's index in it is used as its partition id (hp) below
    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // NOTE(review): the constructor call below is cut off; its remaining arguments are missing.
        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length,

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            // NOTE(review): the rest of the loop body and its closing brace are missing; updates
            // to 'firstId' and 'sizes' are not visible in this excerpt.


        // elapsed wall-clock time for this split
        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));

From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java

License:Apache License

/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition.
 *
 * @throws IOException
 */
// Runs the second step sequentially: for every partition (split index) the first-step
// results are loaded from 'forestPath', a Step2Mapper is configured with them, and all
// records of the split are mapped into 'secondOutput'. Per-partition and slowest
// durations are logged.
// NOTE(review): this excerpt looks truncated by extraction -- some statements and
// closing braces are absent (flagged inline); 'callback' is not used in the visible code.
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);


    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        // both arrays are sized by nbConcerned(...) for this partition
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        // NOTE(review): the call below is cut off; its remaining arguments (presumably fsTrees)
        // are missing from the excerpt.
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys,

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
            // NOTE(review): the rest of the loop body and its closing brace are missing.


        // elapsed wall-clock time for this partition
        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));