List of usage examples for org.apache.hadoop.mapred TextInputFormat getRecordReader
public RecordReader<LongWritable, Text> getRecordReader(InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException
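Before the full source-file examples, here is a minimal sketch of the typical usage pattern they all share (the class name and countLines helper are illustrative, not taken from any of the sources below): configure a JobConf with an input path, configure the TextInputFormat, obtain splits, then read each split through the RecordReader returned by getRecordReader.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatReadExample {

    // Reads every line of the given file/directory via TextInputFormat and returns the line count.
    public static long countLines(String inputPath) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.addInputPath(job, new Path(inputPath));

        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);

        long lines = 0;
        InputSplit[] splits = informat.getSplits(job, 1);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                    informat.getRecordReader(split, job, Reporter.NULL);
            try {
                LongWritable key = reader.createKey(); // byte offset of the line
                Text value = reader.createValue();     // the line itself
                while (reader.next(key, value)) {
                    lines++;
                }
            } finally {
                reader.close();
            }
        }
        return lines;
    }
}

The examples that follow elaborate on this pattern: per-split reading with Reporter.NULL, object reuse of the key/value pair, and closing the reader in a finally block.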
From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java
License:Apache License
/**
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException
 */
public void testGetSplits() throws IOException {
    TextInputFormat input = new TextInputFormat();
    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();
    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    // localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);
    LOG.info("requested " + numSplits + " splits, splitting: got = " + splits.length);
    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {
        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString()
                + ", " + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {
                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }
                count++;
            }
            System.out.println("last: " + value.toString());
            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
        } finally {
            reader.close();
        }
    } // for each split
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); //reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();

                    Cell tmp = new Cell(row, col, lvalue);

                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);

                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameNew
 * @param outMo
 * @param inMO
 * @throws DMLRuntimeException
 */
private void mergeTextCellWithoutComp(String fnameNew, MatrixObject outMo, ArrayList<MatrixObject> inMO)
        throws DMLRuntimeException {
    try {
        //delete target file if already exists
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);

        if (ALLOW_COPY_CELLFILES) {
            copyAllFiles(fnameNew, inMO);
            return; //we're done
        }

        //actual merge
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        FileSystem fs = FileSystem.get(job);
        Path path = new Path(fnameNew);
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        String valueStr = null;

        try {
            for (MatrixObject in : inMO) //read/write all inputs
            {
                LOG.trace("ResultMerge (local, file): Merge input " + in.getVarName() + " (fname="
                        + in.getFileName() + ") via stream merge");

                JobConf tmpJob = new JobConf(ConfigurationManager.getCachedJobConf());
                Path tmpPath = new Path(in.getFileName());
                FileInputFormat.addInputPath(tmpJob, tmpPath);
                TextInputFormat informat = new TextInputFormat();
                informat.configure(tmpJob);
                InputSplit[] splits = informat.getSplits(tmpJob, 1);

                LongWritable key = new LongWritable();
                Text value = new Text();

                for (InputSplit split : splits) {
                    RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, tmpJob,
                            Reporter.NULL);
                    try {
                        while (reader.next(key, value)) {
                            valueStr = value.toString().trim();
                            out.write(valueStr + "\n");
                        }
                    } finally {
                        if (reader != null)
                            reader.close();
                    }
                }
            }
        } finally {
            if (out != null)
                out.close();
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("Unable to merge text cell results.", ex);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeLocalFile.java
License:Open Source License
/**
 * @param fnameStaging
 * @param mo
 * @param ID
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void createTextCellStagingFile(String fnameStaging, MatrixObject mo, long ID)
        throws IOException, DMLRuntimeException {
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    Path path = new Path(mo.getFileName());
    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LinkedList<Cell> buffer = new LinkedList<Cell>();
    LongWritable key = new LongWritable();
    Text value = new Text();

    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    //long row = -1, col = -1; //FIXME needs reconsideration whenever textcell is used actively
    //NOTE MB: Originally, we used long row, col but this led reproducibly to JIT compilation
    //errors during runtime; experienced under WINDOWS, Intel x86-64, IBM JDK 64bit/32bit.
    //It works fine with int row, col but we require long for larger matrices.
    //Since textcell is never used for result merge (hybrid/hadoop: binaryblock, singlenode: binarycell)
    //we just propose to exclude it with -Xjit:exclude={package.method*}(count=0,optLevel=0)
    FastStringTokenizer st = new FastStringTokenizer(' ');

    for (InputSplit split : splits) {
        RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(key, value)) {
                st.reset(value.toString()); //reset tokenizer
                long row = st.nextLong();
                long col = st.nextLong();
                double lvalue = Double.parseDouble(st.nextToken());

                Cell tmp = new Cell(row, col, lvalue);

                buffer.addLast(tmp);
                if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                {
                    appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                    buffer.clear();
                }
            }

            //final flush
            if (!buffer.isEmpty()) {
                appendCellBufferToStagingArea(fnameStaging, ID, buffer, brlen, bclen);
                buffer.clear();
            }
        } finally {
            if (reader != null)
                reader.close();
        }
    }
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCell.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
private void readTextCellMatrixFromHDFS(Path path, JobConf job, MatrixBlock dest, long rlen, long clen,
        int brlen, int bclen) throws IOException {
    boolean sparse = dest.isInSparseFormat();

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);
    InputSplit[] splits = informat.getSplits(job, 1);

    LongWritable key = new LongWritable();
    Text value = new Text();
    int row = -1;
    int col = -1;

    try {
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);

            try {
                if (sparse) //SPARSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();

                        dest.appendValue(row, col, lvalue);
                    }

                    dest.sortSparseRows();
                } else //DENSE<-value
                {
                    while (reader.next(key, value)) {
                        st.reset(value.toString()); //reinit tokenizer
                        row = st.nextInt() - 1;
                        col = st.nextInt() - 1;
                        double lvalue = st.nextDouble();
                        dest.setValueDenseUnsafe(row, col, lvalue);
                    }
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }
    } catch (Exception ex) {
        //post-mortem error handling and bounds checking
        if (row < 0 || row + 1 > rlen || col < 0 || col + 1 > clen) {
            throw new IOException("Matrix cell [" + (row + 1) + "," + (col + 1) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else {
            throw new IOException("Unable to read matrix in text cell format.", ex);
        }
    }
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSVParallel.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param hasHeader
 * @param delim
 * @return
 * @throws IOException
 * @throws DMLRuntimeException
 */
private MatrixBlock computeCSVSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, JobConf job,
        boolean hasHeader, String delim, long estnnz) throws IOException, DMLRuntimeException {
    int nrow = 0;
    int ncol = 0;

    FileInputFormat.addInputPath(job, path);
    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    // count no of entities in the first non-header row
    LongWritable key = new LongWritable();
    Text oneLine = new Text();
    RecordReader<LongWritable, Text> reader = informat.getRecordReader(splits[0], job, Reporter.NULL);
    try {
        if (reader.next(key, oneLine)) {
            String cellStr = oneLine.toString().trim();
            ncol = StringUtils.countMatches(cellStr, delim) + 1;
        }
    } finally {
        IOUtilFunctions.closeSilently(reader);
    }

    // count rows in parallel per split
    try {
        ExecutorService pool = Executors.newFixedThreadPool(_numThreads);
        ArrayList<CountRowsTask> tasks = new ArrayList<CountRowsTask>();
        for (InputSplit split : splits) {
            tasks.add(new CountRowsTask(split, informat, job, hasHeader));
            hasHeader = false;
        }
        pool.invokeAll(tasks);
        pool.shutdown();

        // collect row counts for offset computation
        // early error notify in case not all tasks successful
        _offsets = new SplitOffsetInfos(tasks.size());
        for (CountRowsTask rt : tasks) {
            if (!rt.getReturnCode())
                throw new IOException("Count task for csv input failed: " + rt.getErrMsg());
            _offsets.setOffsetPerSplit(tasks.indexOf(rt), nrow);
            _offsets.setLenghtPerSplit(tasks.indexOf(rt), rt.getRowCount());
            nrow = nrow + rt.getRowCount();
        }
    } catch (Exception e) {
        throw new IOException("Threadpool Error " + e.getMessage(), e);
    }

    // allocate target matrix block based on given size;
    // need to allocate sparse as well since lock-free insert into target
    return createOutputMatrixBlock(nrow, ncol, estnnz, true, true);
}
From source file:com.ibm.bi.dml.udf.lib.RemoveEmptyRows.java
License:Open Source License
@Override
public void execute() {
    Matrix mat = (Matrix) this.getFunctionInput(0);
    String fnameOld = mat.getFilePath();

    HashMap<Long, Long> keyMap = new HashMap<Long, Long>(); //old,new rowID

    try {
        //prepare input
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fnameOld);
        FileSystem fs = FileSystem.get(job);
        if (!fs.exists(path))
            throw new IOException("File " + fnameOld + " does not exist on HDFS.");
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);

        //prepare output
        String fnameNew = createOutputFilePathAndName(OUTPUT_FILE);
        DataOutputStream ostream = MapReduceTool.getHDFSDataOutputStream(fnameNew, true);

        //read and write if necessary
        InputSplit[] splits = informat.getSplits(job, 1);

        LongWritable key = new LongWritable();
        Text value = new Text();
        long ID = 1;

        try {
            //for obj reuse and preventing repeated buffer re-allocations
            StringBuilder sb = new StringBuilder();

            for (InputSplit split : splits) {
                RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
                try {
                    while (reader.next(key, value)) {
                        String cellStr = value.toString().trim();
                        StringTokenizer st = new StringTokenizer(cellStr, " ");
                        long row = Integer.parseInt(st.nextToken());
                        long col = Integer.parseInt(st.nextToken());
                        double lvalue = Double.parseDouble(st.nextToken());

                        if (!keyMap.containsKey(row))
                            keyMap.put(row, ID++);
                        long rowNew = keyMap.get(row);

                        sb.append(rowNew);
                        sb.append(' ');
                        sb.append(col);
                        sb.append(' ');
                        sb.append(lvalue);
                        sb.append('\n');

                        ostream.writeBytes(sb.toString());
                        sb.setLength(0);
                    }
                } finally {
                    if (reader != null)
                        reader.close();
                }
            }

            _ret = new Matrix(fnameNew, keyMap.size(), mat.getNumCols(), ValueType.Double);
        } finally {
            if (ostream != null)
                ostream.close();
        }
    } catch (Exception ex) {
        throw new RuntimeException("Unable to execute external function.", ex);
    }
}
From source file:org.apache.drill.exec.store.text.DrillTextRecordReader.java
License:Apache License
public DrillTextRecordReader(FileSplit split, Configuration fsConf, FragmentContext context, char delimiter,
        List<SchemaPath> columns) {
    this.delimiter = (byte) delimiter;
    this.split = split;
    setColumns(columns);

    if (!isStarQuery()) {
        String pathStr;
        for (SchemaPath path : columns) {
            assert path.getRootSegment().isNamed();
            pathStr = path.getRootSegment().getPath();
            Preconditions.checkArgument(
                    pathStr.equals(COL_NAME) || (pathStr.equals("*") && path.getRootSegment().getChild() == null),
                    "Selected column(s) must have name 'columns' or must be plain '*'");

            if (path.getRootSegment().getChild() != null) {
                Preconditions.checkArgument(path.getRootSegment().getChild().isArray(),
                        "Selected column must be an array index");
                int index = path.getRootSegment().getChild().getArraySegment().getIndex();
                columnIds.add(index);
            }
        }
        Collections.sort(columnIds);
        numCols = columnIds.size();
    }

    TextInputFormat inputFormat = new TextInputFormat();
    JobConf job = new JobConf(fsConf);
    job.setInt("io.file.buffer.size", context.getConfig().getInt(ExecConstants.TEXT_LINE_READER_BUFFER_SIZE));
    job.setInputFormat(inputFormat.getClass());
    try {
        reader = inputFormat.getRecordReader(split, job, Reporter.NULL);
        key = reader.createKey();
        value = reader.createValue();
        totalRecordsRead = 0;
    } catch (Exception e) {
        handleAndRaise("Failure in creating record reader", e);
    }
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length,
                numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition.
 *
 * @throws IOException
 */
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys,
                fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}