List of usage examples for the org.apache.hadoop.mapred.JobConf constructor JobConf(boolean loadDefaults)
public JobConf(boolean loadDefaults)
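The examples below show two recurring patterns: tests that start from an empty configuration with new JobConf(false), so that no *-default.xml or *-site.xml resources are loaded and only explicitly set properties apply, and runtime code that copies an existing Configuration into a new JobConf before layering job-specific settings on top. A minimal sketch of both patterns follows; the class name, property values, and input path are illustrative only, not taken from the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class JobConfConstructionSketch {
    public static void main(String[] args) {
        // Pattern 1: loadDefaults=false -- the JobConf starts empty (no *-default.xml
        // or *-site.xml resources), so only explicitly set properties take effect.
        // The test setups below rely on this for isolation.
        JobConf testConf = new JobConf(false);
        testConf.set("fs.default.name", "file:///");
        testConf.setLong("dfs.block.size", 64L * 1024 * 1024);

        // Pattern 2: copy an existing Configuration (defaults plus site settings),
        // then add job-specific values, as the SystemML examples do with their
        // cached job configuration.
        Configuration shared = new Configuration(); // stand-in for a cached/shared config
        JobConf jobConf = new JobConf(shared);
        jobConf.setJobName("example-job");
        FileInputFormat.setInputPaths(jobConf, new Path("/tmp/input")); // illustrative path
    }
}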
From source file:com.hdfs.concat.crush.CrushReducerTest.java
License:Apache License
@Test
public void missingOutputFormat() {
    JobConf job = new JobConf(false);

    job.set("mapred.tip.id", "task_201011081200_14527_r_1234");

    job.set("fs.default.name", "file:///");
    job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
    job.set("mapred.output.dir", outDir.getAbsolutePath());

    job.setLong("crush.timestamp", 98765);
    job.setLong("dfs.block.size", 1024 * 1024 * 64L);

    job.setInt("crush.num.specs", 2);

    job.set("crush.0.regex", "foo");
    job.set("crush.0.regex.replacement", "bar");
    job.set("crush.0.input.format", SequenceFileInputFormat.class.getName());
    job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName());

    job.set("crush.1.regex", "hello");
    job.set("crush.1.regex.replacement", "hello");
    job.set("crush.1.input.format", SequenceFileInputFormat.class.getName());

    reducer = new CrushReducer();

    try {
        reducer.configure(job);
        fail();
    } catch (IllegalArgumentException e) {
        if (!"No output format: crush.1.output.format".equals(e.getMessage())) {
            throw e;
        }
    }
}
From source file:com.hdfs.concat.crush.CrushReducerTest.java
License:Apache License
@Test
public void outputFormatWrongType() {
    JobConf job = new JobConf(false);

    job.set("mapred.tip.id", "task_201011081200_14527_r_1234");

    job.set("fs.default.name", "file:///");
    job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
    job.set("mapred.output.dir", outDir.getAbsolutePath());

    job.setLong("crush.timestamp", 98765);
    job.setLong("dfs.block.size", 1024 * 1024 * 64L);

    job.setInt("crush.num.specs", 2);

    job.set("crush.0.regex", "foo");
    job.set("crush.0.regex.replacement", "bar");
    job.set("crush.0.input.format", SequenceFileInputFormat.class.getName());
    job.set("crush.0.output.format", SequenceFileOutputFormat.class.getName());

    job.set("crush.1.regex", "hello");
    job.set("crush.1.regex.replacement", "hello");
    job.set("crush.1.input.format", TextInputFormat.class.getName());
    job.set("crush.1.output.format", Object.class.getName());

    reducer = new CrushReducer();

    try {
        reducer.configure(job);
        fail();
    } catch (IllegalArgumentException e) {
        if (!"Not an output format: crush.1.output.format=java.lang.Object".equals(e.getMessage())) {
            throw e;
        }
    }
}
From source file:com.hdfs.concat.crush.CrushStandAloneSequenceFileTest.java
License:Apache License
@Before
public void setup() throws Exception {
    job = new JobConf(false);

    job.set("fs.default.name", "file:///");
    job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");

    job.setLong("dfs.block.size", 50);
}
From source file:com.hdfs.concat.crush.CrushTest.java
License:Apache License
@Before
public void setupJob() throws IOException {
    job = new JobConf(false);

    job.set("fs.default.name", "file:///");
    job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");

    job.setInt("mapred.reduce.tasks", 5);
    job.setLong("dfs.block.size", 50);

    FileSystem delegate = FileSystem.get(job);
    fileSystem = new SortingFileSystem(delegate);

    /*
     * Set the working directory so that all relative paths are rooted in the tmp dir. This will keep the file system clean of
     * temporary test files.
     */
    FileSystem.get(job).setWorkingDirectory(new Path(tmp.getRoot().getAbsolutePath()));
}
From source file:com.ibm.bi.dml.api.DMLScript.java
License:Open Source License
/**
 * @param config
 * @throws IOException
 * @throws ParseException
 */
private static void cleanupHadoopExecution(DMLConfig config) throws IOException, ParseException {
    //create dml-script-specific suffix
    StringBuilder sb = new StringBuilder();
    sb.append(Lop.FILE_SEPARATOR);
    sb.append(Lop.PROCESS_PREFIX);
    sb.append(DMLScript.getUUID());
    String dirSuffix = sb.toString();

    //1) cleanup scratch space (everything for current uuid)
    //(required otherwise export to hdfs would skip assumed unnecessary writes if same name)
    MapReduceTool.deleteFileIfExistOnHDFS(config.getTextValue(DMLConfig.SCRATCH_SPACE) + dirSuffix);

    //2) cleanup hadoop working dirs (only required for LocalJobRunner (local job tracker), because
    //this implementation does not create job specific sub directories)
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    if (InfrastructureAnalyzer.isLocalMode(job)) {
        try {
            LocalFileUtils.deleteFileIfExists(DMLConfig.LOCAL_MR_MODE_STAGING_DIR + dirSuffix); //staging dir (for local mode only)
            LocalFileUtils.deleteFileIfExists(MRJobConfiguration.getLocalWorkingDirPrefix(job) + dirSuffix); //local dir
            MapReduceTool.deleteFileIfExistOnHDFS(MRJobConfiguration.getSystemWorkingDirPrefix(job) + dirSuffix); //system dir
            MapReduceTool.deleteFileIfExistOnHDFS(MRJobConfiguration.getStagingWorkingDirPrefix(job) + dirSuffix); //staging dir
        } catch (Exception ex) {
            //we give only a warning because those directories are written by the mapred deamon
            //and hence, execution can still succeed
            LOG.warn("Unable to cleanup hadoop working dirs: " + ex.getMessage());
        }
    }

    //3) cleanup systemml-internal working dirs
    CacheableData.cleanupCacheDir(); //might be local/hdfs
    LocalFileUtils.cleanupWorkingDirectory();
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
private void partitionTextCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileInputFormat.addInputPath(job, path);
        TextInputFormat informat = new TextInputFormat();
        informat.configure(job);
        InputSplit[] splits = informat.getSplits(job, 1);

        LinkedList<Cell> buffer = new LinkedList<Cell>();
        LongWritable key = new LongWritable();
        Text value = new Text();
        FastStringTokenizer st = new FastStringTokenizer(' ');

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = informat.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    st.reset(value.toString()); //reset tokenizer
                    row = st.nextLong();
                    col = st.nextLong();
                    double lvalue = st.nextDouble();

                    Cell tmp = new Cell(row, col, lvalue);

                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerTextCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeTextCellFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition text cell matrix.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void partitionBinaryCell(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    long row = -1;
    long col = -1;

    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileSystem fs = FileSystem.get(job);

        //prepare sequence file reader, and write to local staging area
        LinkedList<Cell> buffer = new LinkedList<Cell>();
        MatrixIndexes key = new MatrixIndexes();
        MatrixCell value = new MatrixCell();

        for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
            try {
                while (reader.next(key, value)) {
                    row = key.getRowIndex();
                    col = key.getColumnIndex();
                    Cell tmp = new Cell(row, col, value.getValue());

                    buffer.addLast(tmp);
                    if (buffer.size() > StagingFileUtils.CELL_BUFFER_SIZE) //periodic flush
                    {
                        appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                        buffer.clear();
                    }
                }

                //final flush
                if (!buffer.isEmpty()) {
                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerBinaryCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeBinaryCellSequenceFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        //post-mortem error handling and bounds checking
        if (row < 1 || row > rlen || col < 1 || col > clen) {
            throw new DMLRuntimeException("Matrix cell [" + (row) + "," + (col) + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        } else
            throw new DMLRuntimeException("Unable to partition binary cell matrix.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void partitionBinaryBlock(String fname, String fnameStaging, String fnameNew, long rlen, long clen,
        int brlen, int bclen) throws DMLRuntimeException {
    try {
        //create reuse object
        _reuseBlk = DataPartitioner.createReuseMatrixBlock(_format, brlen, bclen);

        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileSystem fs = FileSystem.get(job);

        //prepare sequence file reader, and write to local staging area
        MatrixIndexes key = new MatrixIndexes();
        MatrixBlock value = new MatrixBlock();

        for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
            try {
                while (reader.next(key, value)) //for each block
                {
                    long row_offset = (key.getRowIndex() - 1) * brlen;
                    long col_offset = (key.getColumnIndex() - 1) * bclen;
                    long rows = value.getNumRows();
                    long cols = value.getNumColumns();

                    //bound check per block
                    if (row_offset + rows < 1 || row_offset + rows > rlen || col_offset + cols < 1
                            || col_offset + cols > clen) {
                        throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows)
                                + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] "
                                + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
                    }

                    appendBlockToStagingArea(fnameStaging, value, row_offset, col_offset, brlen, bclen);
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerBinaryBlock(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeBinaryBlockSequenceFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir, false);
        }
    } catch (Exception e) {
        throw new DMLRuntimeException("Unable to partition binary block matrix.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerLocal.java
License:Open Source License
/**
 * @param fname
 * @param fnameStaging
 * @param fnameNew
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
private void partitionBinaryBlock2BinaryCell(String fname, String fnameStaging, String fnameNew, long rlen,
        long clen, int brlen, int bclen) throws DMLRuntimeException {
    try {
        //STEP 1: read matrix from HDFS and write blocks to local staging area
        //check and add input path
        JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
        Path path = new Path(fname);
        FileSystem fs = FileSystem.get(job);

        //prepare sequence file reader, and write to local staging area
        MatrixIndexes key = new MatrixIndexes();
        MatrixBlock value = new MatrixBlock();
        LinkedList<Cell> buffer = new LinkedList<Cell>();

        for (Path lpath : MatrixReader.getSequenceFilePaths(fs, path)) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, lpath, job);
            try {
                while (reader.next(key, value)) //for each block
                {
                    long row_offset = (key.getRowIndex() - 1) * brlen;
                    long col_offset = (key.getColumnIndex() - 1) * bclen;
                    long rows = value.getNumRows();
                    long cols = value.getNumColumns();

                    //bound check per block
                    if (row_offset + rows < 1 || row_offset + rows > rlen || col_offset + cols < 1
                            || col_offset + cols > clen) {
                        throw new IOException("Matrix block [" + (row_offset + 1) + ":" + (row_offset + rows)
                                + "," + (col_offset + 1) + ":" + (col_offset + cols) + "] "
                                + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
                    }

                    boolean sparse = value.isInSparseFormat();
                    if (sparse) //SPARSE
                    {
                        SparseRowsIterator iter = value.getSparseRowsIterator();
                        while (iter.hasNext()) {
                            IJV lcell = iter.next();
                            Cell tmp = new Cell(row_offset + lcell.i + 1, col_offset + lcell.j + 1, lcell.v);
                            buffer.addLast(tmp);
                        }
                    } else //DENSE
                    {
                        for (int i = 0; i < rows; i++)
                            for (int j = 0; j < cols; j++) {
                                double lvalue = value.getValueDenseUnsafe(i, j);
                                if (lvalue != 0) //for nnz
                                {
                                    Cell tmp = new Cell(row_offset + i + 1, col_offset + j + 1, lvalue);
                                    buffer.addLast(tmp);
                                }
                            }
                    }

                    appendCellBufferToStagingArea(fnameStaging, buffer, brlen, bclen);
                    buffer.clear();
                }
            } finally {
                if (reader != null)
                    reader.close();
            }
        }

        //STEP 2: read matrix blocks from staging area and write matrix to HDFS
        String[] fnamesPartitions = new File(fnameStaging).list();
        if (PARALLEL) {
            int len = Math.min(fnamesPartitions.length, _par);
            Thread[] threads = new Thread[len];
            for (int i = 0; i < len; i++) {
                int start = i * (int) Math.ceil(((double) fnamesPartitions.length) / len);
                int end = (i + 1) * (int) Math.ceil(((double) fnamesPartitions.length) / len) - 1;
                end = Math.min(end, fnamesPartitions.length - 1);
                threads[i] = new Thread(new DataPartitionerWorkerBinaryCell(job, fnameNew, fnameStaging,
                        fnamesPartitions, start, end));
                threads[i].start();
            }

            for (Thread t : threads)
                t.join();
        } else {
            for (String pdir : fnamesPartitions)
                writeBinaryCellSequenceFileToHDFS(job, fnameNew, fnameStaging + "/" + pdir);
        }
    } catch (Exception e) {
        throw new DMLRuntimeException("Unable to partition binary block matrix.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.DataPartitionerRemoteMR.java
License:Open Source License
@Override
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen,
        long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-DPMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(DataPartitionerRemoteMR.class);
    if (_pfid >= 0) //use in parfor
        job.setJobName(jobname + _pfid);
    else //use for partition instruction
        job.setJobName("Partition-MR");

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    try {
        //force writing to disk (typically not required since partitioning only applied if dataset exceeds CP size)
        in.exportData(); //written to disk iff dirty

        Path path = new Path(in.getFileName());

        /////
        //configure the MR job
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, _format, _n, fnameNew,
                _keepIndexes);

        //set mappers, reducers, combiners
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(DataPartitionerRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            //binary cell intermediates for reduced IO
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            job.setMapOutputKeyClass(LongWritable.class);
            job.setMapOutputValueClass(PairWritableBlock.class);

            //check Alignment
            if ((_format == PDataPartitionFormat.ROW_BLOCK_WISE_N && rlen > _n && _n % brlen != 0)
                    || (_format == PDataPartitionFormat.COLUMN_BLOCK_WISE_N && clen > _n && _n % bclen != 0)) {
                throw new DMLRuntimeException(
                        "Data partitioning format " + _format + " requires aligned blocks.");
            }
        }

        //set input format
        job.setInputFormat(ii.inputFormatClass);

        //set the input path and output path
        FileInputFormat.setInputPaths(job, path);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        //FileOutputFormat.setOutputPath(job, pathNew);
        job.setOutputFormat(NullOutputFormat.class);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = -1;
        switch (_format) {
        case ROW_WISE:
            reducerGroups = rlen;
            break;
        case COLUMN_WISE:
            reducerGroups = clen;
            break;
        case ROW_BLOCK_WISE:
            reducerGroups = (rlen / brlen) + ((rlen % brlen == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE:
            reducerGroups = (clen / bclen) + ((clen % bclen == 0) ? 0 : 1);
            break;
        case ROW_BLOCK_WISE_N:
            reducerGroups = (rlen / _n) + ((rlen % _n == 0) ? 0 : 1);
            break;
        case COLUMN_BLOCK_WISE_N:
            reducerGroups = (clen / _n) + ((clen % _n == 0) ? 0 : 1);
            break;
        default:
            //do nothing
        }
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        /*if( ParForProgramBlock.USE_FLEX_SCHEDULER_CONF )
        {
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", _numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", _numMappers);
        }*/

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);

        //set the max number of retries per map task
        // disabled job-level configuration to respect cluster configuration
        // note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS && _pfid >= 0) {
        long t1 = System.nanoTime(); //only for parfor
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}