Example usage for org.apache.hadoop.mapred TextInputFormat getRecordReader

Introduction

This page collects example usages of org.apache.hadoop.mapred.TextInputFormat.getRecordReader.

Prototype

public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter)
            throws IOException 
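
Before the collected examples, here is a minimal, self-contained sketch of the typical call sequence (not taken from the examples below; the input path and class name are placeholders): add an input path to a JobConf, configure the TextInputFormat, compute splits, then read each split through the RecordReader returned by getRecordReader.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatSketch {

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.setInputPaths(job, new Path("/tmp/input.txt")); // placeholder path

        TextInputFormat format = new TextInputFormat();
        format.configure(job); // must be called before getSplits/getRecordReader

        InputSplit[] splits = format.getSplits(job, 1); // numSplits is only a hint

        LongWritable key = new LongWritable(); // byte offset of the line in the file
        Text value = new Text();               // line content without the line terminator

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}

Reporter.NULL is a no-op reporter; inside a real map task the framework supplies the Reporter instance, as the examples below illustrate.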

Usage

From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java

License:Apache License

/**
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException 
 * 
 */
public void testGetSplits() throws IOException {

    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", "
                + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {

                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }

                count++;
            }

            System.out.println("last: " + value.toString());

            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

        } finally {
            reader.close();
        }

    } // for each split

}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java

License:Open Source License

/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area         
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); //reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();
                    Cell tmp = new Cell(row, col, lvalue);

                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License:Open Source License

/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}

From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java

License:Open Source License

/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */

private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    // errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    // It works fine with int row, col but we require long for larger matrices.
    // Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode:binarycell)
    // we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)

    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}

From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java

License:Open Source License

/**
 * @param path
 * @param job
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);

            try {
                if (sparse) //SPARSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.appendValue(row, col, lvalue);
                    }

                    dest.sortSparseRows();
                } else //DENSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }
    } catch (Exception ex) {
        //post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in text cell format.", ex);
        }
    }
}

From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSVParallel.java

License:Open Source License

/**
 * @param path
 * @param job
 * @param hasHeader
 * @param delim
 * @return
 * @throws IOException
 * @throws DMLRuntimeException 
 */
private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job,
        boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    // count no of entities in the first non-header row
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }

    // count rows in parallel per split
    try {
        ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<CountRowsTask>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            hasHeader = false;
        }
        pool.invokeAll(tasks);
        pool.shutdown();

        // collect row counts for offset computation
        // early error notify in case not all tasks successful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    } catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    }

    // allocate target matrix block based on given size; 
    // need to allocate sparse as well since lock-free insert into target
    return createOutputMatrixBlock(nrow, ncol, estnnz, true, true);
}

From source file:com.ibm.bi.dml.udf.lib.RemoveEmptyRows.java

License:Open Source License

@Override
public void execute() {
    Matrix mat = (Matrix) this.getFunctionInput(0);
    String fnameOld = mat.getFilePath();

    HashMap<Long, Long> keyMap = new HashMap<Long, Long>(); //old,new rowID

    try {
        //prepare input
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameOld);
        FileSystem fs = FileSystem.get(job);
        if (!fs.exists(path))
            throw new IOException("File " + fnameOld + " does not exist on HDFS.");
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);

        //prepare output
        String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
        DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);

        //read and write if necessary
        InputSplit[] splits = informat.getSplits(job, 1);

        LongWritable key = new LongWritable();
        Text value = new Text();
        long ID = 1;

        try {
            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        String cellStr = value.toString().trim();
                        StringTokenizer st = new StringTokenizer(cellStr, " ");
                        long row = Integer.parseInt(st.nextToken());
                        long col = Integer.parseInt(st.nextToken());
                        double lvalue = Double.parseDouble(st.nextToken());

                        if (!keyMap.containsKey(row))
                            keyMap.put(row, ID++);
                        long rowNew = keyMap.get(row);

                        sb.append(rowNew);
                        sb.append(' ');
                        sb.append(col);
                        sb.append(' ');
                        sb.append(lvalue);
                        sb.append('\n');

                        ostream.writeBytes(sb.toString());
                        sb.setLength(0);
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }

            _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
        } finally {
            if (ostream != null)
                ostream.close();
        }
    } catch (Exception ex) {
        throw new RuntimeException("Unable to execute external function.", ex);
    }
}

From source file:org.apache.drill.exec.store.text.DrillTextRecordReader.java

License:Apache License

public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context, char delimiter,
        List<SchemaPath> columns) {
    this.delimiter = (byte) delimiter;
    this.split = split;
    setColumns(columns);

    if (!isStarQuery()) {
        String pathStr;
        for (SchemaPath path : columns) {
            assert path.getRootSegment().isNamed();
            pathStr = path.getRootSegment().getPath();
            Preconditions.checkArgument(
                    pathStr.equals(COL_NAME)
                            || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
                    "Selected column(s) must have name 'columns' or must be plain '*'");

            if (path.getRootSegment().getChild() != null) {
                Preconditions.checkArgument(path.getRootSegment().getChild().isArray(),
                        "Selected column must be an array index");
                int index = path.getRootSegment().getChild().getArraySegment().getIndex();
                columnIds.add(index);
            }
        }
        Collections.sort(columnIds);
        numCols = columnIds.size();
    }

    TextInputFormat inputFormat = new TextInputFormat();
    JobConf job = new JobConf(fsConf);
    job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
    job.setInputFormat(inputFormat.getClass());
    try {
        reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
        key = reader.createKey();
        value = reader.createValue();
        totalRecordsRead = 0;
    } catch (Exception e) {
        handleAndRaise("Failure in creating record reader", e);
    }
}

From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java

License:Apache License

@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length,
                numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}

From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java

License:Apache License

/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition.
 * 
 * @throws IOException
 * 
 */
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys,
                fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}