List of usage examples for org.apache.hadoop.mapreduce Job Job
Job(JobConf conf) throws IOException
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java
License:Apache License
private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException { //obtain delegation tokens for the job if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) { TableMapReduceUtil.initCredentials(jobConf); }//from www. j ava2 s .c om String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME); String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING); boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true); if (hbaseColumnsMapping == null) { throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table."); } ColumnMappings columnMappings = null; int iTimeColumn = -1; try { columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching); iTimeColumn = HBaseSerDe.getTxTimeColumnIndex(columnMappings, jobConf); } catch (SerDeException e) { throw new IOException(e); } int iKey = columnMappings.getKeyIndex(); int iTimestamp = columnMappings.getTimestampIndex(); ColumnMapping keyMapping = columnMappings.getKeyMapping(); if (iTimeColumn != -1) { List<org.apache.hadoop.mapreduce.InputSplit> splits = TxHiveTableInputFormatUtil.getSplits(jobConf, numSplits, columnMappings, iTimeColumn, hbaseTableName); if (splits != null) { Job job = new Job(jobConf); JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job); Path[] tablePaths = FileInputFormat.getInputPaths(jobContext); InputSplit[] results = new InputSplit[splits.size()]; for (int i = 0; i < splits.size(); i++) { results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0], true); } LOG.info("getSplits: TxHiveIndexScan"); return results; } } LOG.info("getSplits: no TxHiveIndexScan"); setHTable(new HTable(HBaseConfiguration.create(jobConf), Bytes.toBytes(hbaseTableName))); // Take filter pushdown into account while calculating splits; this // allows us to prune off regions immediately. Note that although // the Javadoc for the superclass getSplits says that it returns one // split per region, the implementation actually takes the scan // definition into account and excludes regions which don't satisfy // the start/stop row conditions (HBASE-1829). Scan scan = createFilterScan(jobConf, iKey, iTimestamp, HiveHBaseInputFormatUtil.getStorageFormatOfKey( keyMapping.mappingSpec, jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string"))); // The list of families that have been added to the scan List<String> addedFamilies = new ArrayList<String>(); // REVIEW: are we supposed to be applying the getReadColumnIDs // same as in getRecordReader? for (ColumnMapping colMap : columnMappings) { if (colMap.hbaseRowKey || colMap.hbaseTimestamp) { continue; } if (colMap.qualifierName == null) { scan.addFamily(colMap.familyNameBytes); addedFamilies.add(colMap.familyName); } else { if (!addedFamilies.contains(colMap.familyName)) { // add the column only if the family has not already been added scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes); } } } setScan(scan); Job job = new Job(jobConf); JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job); Path[] tablePaths = FileInputFormat.getInputPaths(jobContext); List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext); InputSplit[] results = new InputSplit[splits.size()]; for (int i = 0; i < splits.size(); i++) { results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]); } return results; }
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableOutputFormat.java
License:Apache License
/** * Update the out table, and output an empty key as the key. * * @param jc the job configuration file/*from ww w.j a v a 2 s. c om*/ * @param finalOutPath the final output table name * @param valueClass the value class * @param isCompressed whether the content is compressed or not * @param tableProperties the table info of the corresponding table * @param progress progress used for status report * @return the RecordWriter for the output file */ @Override public void checkOutputSpecs(FileSystem fs, JobConf jc) throws IOException { //obtain delegation tokens for the job if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) { TableMapReduceUtil.initCredentials(jc); } String hbaseTableName = jc.get(HBaseSerDe.HBASE_TABLE_NAME); jc.set(TableOutputFormat.OUTPUT_TABLE, hbaseTableName); Job job = new Job(jc); JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job); try { checkOutputSpecs(jobContext); } catch (InterruptedException e) { throw new IOException(e); } }
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHFileOutputFormat.java
License:Apache License
@Override public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath, Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties, final Progressable progressable) throws IOException { // Read configuration for the target path, first from jobconf, then from table properties String hfilePath = getFamilyPath(jc, tableProperties); if (hfilePath == null) { throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles"); }/*from ww w .j a v a 2s .c o m*/ // Target path's last component is also the column family name. final Path columnFamilyPath = new Path(hfilePath); final String columnFamilyName = columnFamilyPath.getName(); final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName); final Job job = new Job(jc); setCompressOutput(job, isCompressed); setOutputPath(job, finalOutPath); // Create the HFile writer final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims() .newTaskAttemptContext(job.getConfiguration(), progressable); final Path outputdir = FileOutputFormat.getOutputPath(tac); final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter( tac); // Individual columns are going to be pivoted to HBase cells, // and for each row, they need to be written out in order // of column name, so sort the column names now, creating a // mapping to their column position. However, the first // column is interpreted as the row key. String columnList = tableProperties.getProperty("columns"); String[] columnArray = columnList.split(","); final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR); int i = 0; for (String columnName : columnArray) { if (i != 0) { columnMap.put(Bytes.toBytes(columnName), i); } ++i; } return new RecordWriter() { @Override public void close(boolean abort) throws IOException { try { fileWriter.close(null); if (abort) { return; } // Move the hfiles file(s) from the task output directory to the // location specified by the user. FileSystem fs = outputdir.getFileSystem(jc); fs.mkdirs(columnFamilyPath); Path srcDir = outputdir; for (;;) { FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER); if ((files == null) || (files.length == 0)) { throw new IOException("No family directories found in " + srcDir); } if (files.length != 1) { throw new IOException("Multiple family directories found in " + srcDir); } srcDir = files[0].getPath(); if (srcDir.getName().equals(columnFamilyName)) { break; } } for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) { fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName())); } // Hive actually wants a file as task output (not a directory), so // replace the empty directory with an empty file to keep it happy. fs.delete(outputdir, true); fs.createNewFile(outputdir); } catch (InterruptedException ex) { throw new IOException(ex); } } private void writeText(Text text) throws IOException { // Decompose the incoming text row into fields. String s = text.toString(); String[] fields = s.split("\u0001"); assert (fields.length <= (columnMap.size() + 1)); // First field is the row key. byte[] rowKeyBytes = Bytes.toBytes(fields[0]); // Remaining fields are cells addressed by column name within row. for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) { byte[] columnNameBytes = entry.getKey(); int iColumn = entry.getValue(); String val; if (iColumn >= fields.length) { // trailing blank field val = ""; } else { val = fields[iColumn]; if ("\\N".equals(val)) { // omit nulls continue; } } byte[] valBytes = Bytes.toBytes(val); KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes); try { fileWriter.write(null, kv); } catch (IOException e) { LOG.error("Failed while writing row: " + s); throw e; } catch (InterruptedException ex) { throw new IOException(ex); } } } private void writePut(PutWritable put) throws IOException { ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow()); SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap(); for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) { Collections.sort(entry.getValue(), new CellComparator()); for (Cell c : entry.getValue()) { try { fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c)); } catch (InterruptedException e) { throw (InterruptedIOException) new InterruptedIOException().initCause(e); } } } } @Override public void write(Writable w) throws IOException { if (w instanceof Text) { writeText((Text) w); } else if (w instanceof PutWritable) { writePut((PutWritable) w); } else { throw new IOException("Unexpected writable " + w); } } }; }
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHFileOutputFormat.java
License:Apache License
@Override public void checkOutputSpecs(FileSystem ignored, JobConf jc) throws IOException { //delegate to the new api Job job = new Job(jc); JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job); checkOutputSpecs(jobContext);//from w w w . j a v a 2 s . c o m }
From source file:com.github.seqware.queryengine.plugins.runners.inmemory.InMemoryPluginRunner.java
License:Open Source License
public InMemoryPluginRunner(PluginInterface pluginInterface, Reference reference, List<FeatureSet> inputSet, Object[] parameters) {/* ww w.j a va2 s. com*/ if (reference != null) { throw new UnsupportedOperationException(); } this.pluginInterface = pluginInterface; CreateUpdateManager manager = SWQEFactory.getModelManager(); //outputSet should attach to the original reference FeatureSet outputSet = manager.buildFeatureSet().setReferenceID(inputSet.iterator().next().getReferenceID()) .build(); manager.close(); byte[][] sSet = new byte[inputSet.size()][];//SWQEFactory.getSerialization().serialize(inputSet); for (int i = 0; i < sSet.length; i++) { sSet[i] = SWQEFactory.getSerialization().serialize(inputSet.get(i)); } byte[] dSet = SWQEFactory.getSerialization().serialize(outputSet); // pretend to serialize parameters this.serializedParameters = MRHBasePluginRunner.serializeParametersToString(parameters, pluginInterface, sSet, dSet); // pretend to de-serialize Configuration conf = new Configuration(); String[] str_params = serializeParametersToString(parameters, pluginInterface, sSet, dSet); conf.setStrings(EXT_PARAMETERS, str_params); Job job = null; try { job = new Job(conf); } catch (IOException ex) { Rethrow.rethrow(ex); } Class plugin = MRHBasePluginRunner.transferConfiguration(job, this); // this is not currently asynchronous if (pluginInterface instanceof MapReducePlugin) { MapReducePlugin mrPlugin = null; try { mrPlugin = (MapReducePlugin) plugin.newInstance(); } catch (InstantiationException ex) { Rethrow.rethrow(ex); } catch (IllegalAccessException ex) { Rethrow.rethrow(ex); } mrPlugin.mapInit(this); Map<Long, Map<FeatureSet, Collection<Feature>>> map = new HashMap<Long, Map<FeatureSet, Collection<Feature>>>(); for (FeatureSet set : inputSet) { for (Feature feature : set) { for (long i = feature.getStart(); i <= feature.getStop(); i++) { if (!map.containsKey(i)) { map.put(i, new HashMap<FeatureSet, Collection<Feature>>()); } if (!map.get(i).containsKey(set)) { map.get(i).put(set, new ArrayList<Feature>()); } map.get(i).get(set).add(feature); } } } // mimic filtering for (Entry<Long, Map<FeatureSet, Collection<Feature>>> e : map.entrySet()) { Map<FeatureSet, Collection<Feature>> innerMap = MRHBasePluginRunner .handlePreFilteredPlugins(e.getValue(), mrPlugin, this.ext_parameters); // not sure what to do for position here mrPlugin.map(e.getKey(), innerMap, this); mrPlugin.mapCleanup(); } mrPlugin.reduceInit(); // TODO: make this pass through functional in order to simulate MapReduce for (Feature f : inputSet.iterator().next()) { mrPlugin.reduce(null, null, this); } mrPlugin.reduceCleanup(); mrPlugin.cleanup(); } else { throw new UnsupportedOperationException("Scan plugins not supported yet"); } }
From source file:com.gsinnovations.howdah.AbstractJob.java
License:Apache License
protected Job prepareJob(Path inputPath, Path outputPath, Class<? extends InputFormat> inputFormat, Class<? extends Mapper> mapper, Class<? extends Writable> mapperKey, Class<? extends Writable> mapperValue, Class<? extends Reducer> reducer, Class<? extends Writable> reducerKey, Class<? extends Writable> reducerValue, Class<? extends OutputFormat> outputFormat) throws IOException { Job job = new Job(new Configuration(getConf())); Configuration jobConf = job.getConfiguration(); if (reducer.equals(Reducer.class)) { if (mapper.equals(Mapper.class)) { throw new IllegalStateException("Can't figure out the user class jar file from mapper/reducer"); }/* ww w. ja va 2s . c o m*/ job.setJarByClass(mapper); } else { job.setJarByClass(reducer); } job.setInputFormatClass(inputFormat); jobConf.set("mapred.input.dir", inputPath.toString()); job.setMapperClass(mapper); job.setMapOutputKeyClass(mapperKey); job.setMapOutputValueClass(mapperValue); jobConf.setBoolean("mapred.compress.map.output", true); job.setReducerClass(reducer); job.setOutputKeyClass(reducerKey); job.setOutputValueClass(reducerValue); job.setJobName(getCustomJobName(job, mapper, reducer)); job.setOutputFormatClass(outputFormat); jobConf.set("mapred.output.dir", outputPath.toString()); return job; }
From source file:com.gsinnovations.howdah.Driver.java
License:Apache License
public static void job(Path input, Path output, int numReduceTasks) throws IOException, ClassNotFoundException, InterruptedException { Configuration conf = new Configuration(); Job job = new Job(conf); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapperClass(TikaMapper.class); //job.setCombinerClass(KMeansCombiner.class); //job.setReducerClass(KMeansReducer.class); job.setNumReduceTasks(numReduceTasks); FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, output); job.setJarByClass(Driver.class); HadoopUtil.overwriteOutput(output);//from ww w . ja v a 2 s .c o m job.waitForCompletion(true); }
From source file:com.gsvic.csmr.CSMRBase.java
License:Apache License
public static void generatePairs(String in, String out) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); path = out;/* w ww . j a v a2 s . co m*/ Job job; Path input, output; input = new Path(in); output = new Path(path + "/CSMRPairs"); job = new Job(conf); job.setJobName("CSMR Pairs Job"); job.setJarByClass(CSMRBase.class); FileInputFormat.addInputPath(job, input); FileOutputFormat.setOutputPath(job, output); job.setMapperClass(CSMRMapper.class); job.setReducerClass(CSMRReducer.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(DocumentWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(VectorArrayWritable.class); job.waitForCompletion(true); }
From source file:com.gsvic.csmr.CSMRBase.java
License:Apache License
public static void StartCSMR() throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); Job job;/*w w w.j a v a 2s. c o m*/ job = new Job(conf); job.setJobName("CSMR Cosine Similarity Job"); job.setJarByClass(CSMRBase.class); FileInputFormat.addInputPath(job, new Path(path + "/CSMRPairs/part-r-00000")); FileOutputFormat.setOutputPath(job, new Path(path + "/Results")); job.setMapperClass(Mapper.class); job.setReducerClass(CosineSimilarityReducer.class); job.setInputFormatClass(SequenceFileInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(VectorArrayWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(DoubleWritable.class); System.exit(job.waitForCompletion(true) ? 1 : 0); }
From source file:com.hadoop.mapreduce.TestLzoTextInputFormat.java
License:Open Source License
/** * Generate random data, compress it, index and md5 hash the data. * Then read it all back and md5 that too, to verify that it all went ok. * /*w w w .jav a2 s .c o m*/ * @param testWithIndex Should we index or not? * @param charsToOutput How many characters of random data should we output. * @throws IOException * @throws NoSuchAlgorithmException * @throws InterruptedException */ private void runTest(boolean testWithIndex, int charsToOutput) throws IOException, NoSuchAlgorithmException, InterruptedException { if (!GPLNativeCodeLoader.isNativeCodeLoaded()) { LOG.warn("Cannot run this test without the native lzo libraries"); return; } Configuration conf = new Configuration(); conf.setLong("fs.local.block.size", charsToOutput / 2); // reducing block size to force a split of the tiny file conf.set("io.compression.codecs", LzopCodec.class.getName()); FileSystem localFs = FileSystem.getLocal(conf); localFs.delete(outputDir, true); localFs.mkdirs(outputDir); Job job = new Job(conf); TextOutputFormat.setCompressOutput(job, true); TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class); TextOutputFormat.setOutputPath(job, outputDir); TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2)); // create some input data byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput); if (testWithIndex) { Path lzoFile = new Path(outputDir, lzoFileName); LzoTextInputFormat.createIndex(localFs, lzoFile); } LzoTextInputFormat inputFormat = new LzoTextInputFormat(); TextInputFormat.setInputPaths(job, outputDir); List<InputSplit> is = inputFormat.getSplits(job); //verify we have the right number of lzo chunks if (testWithIndex && OUTPUT_BIG == charsToOutput) { assertEquals(3, is.size()); } else { assertEquals(1, is.size()); } // let's read it all and calculate the md5 hash for (InputSplit inputSplit : is) { RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext); rr.initialize(inputSplit, attemptContext); while (rr.nextKeyValue()) { Text value = rr.getCurrentValue(); md5.update(value.getBytes(), 0, value.getLength()); } rr.close(); } localFs.close(); assertTrue(Arrays.equals(expectedMd5, md5.digest())); }