List of usage examples for the org.apache.hadoop.mapreduce.TaskAttemptID no-argument constructor, TaskAttemptID()
public TaskAttemptID()
From source file:be.uantwerpen.adrem.bigfim.ComputeTidListReducerTest.java
License:Apache License
@Test public void One_PG_One_Item() throws Exception {
    // Record on the mocked MultipleOutputs exactly the writes the reducer is expected
    // to perform for a single-item input: an opening record, one data record, and a
    // closing empty record, all on output "pg/bucket-0".
    MultipleOutputs<IntArrayWritable, IntMatrixWritable> mos = createMock(MultipleOutputs.class);
    mos.write(newIAW(1), EmptyImw, "pg/bucket-0");
    mos.write(newIAW(0),
            new IntMatrixWritable(newIAW(0, 1, 2, 4, 7, 9), newIAW(0, 1, 2, 3, 5, 6, 8)),
            "pg/bucket-0");
    mos.write(EmptyIaw, EmptyImw, "pg/bucket-0");
    mos.close();
    // The mocked context only needs to hand out a configuration and a task attempt id.
    Reducer.Context ctx = createMock(Reducer.Context.class);
    EasyMock.expect(ctx.getConfiguration()).andReturn(createConfiguration()).anyTimes();
    EasyMock.expect(ctx.getTaskAttemptID()).andReturn(new TaskAttemptID()).anyTimes();
    EasyMock.replay(ctx, mos);
    ComputeTidListReducer reducer = new ComputeTidListReducer();
    reducer.setup(ctx);
    // Swap in the mocked MultipleOutputs via reflection so the expectations above apply.
    setField(reducer, "mos", mos);
    reducer.reduce(new Text("1"), createTestInput_1Item(), ctx);
    reducer.cleanup(ctx);
    EasyMock.verify(mos);
}
From source file:be.uantwerpen.adrem.bigfim.ComputeTidListReducerTest.java
License:Apache License
@Test public void One_PG_N_Items() throws Exception {
    // Record on the mocked MultipleOutputs the writes the reducer is expected to perform
    // for a multi-item input reduced under a single key: an opening record, three data
    // records, and a closing empty record, all on output "pg/bucket-0".
    MultipleOutputs<IntArrayWritable, IntMatrixWritable> mos = createMock(MultipleOutputs.class);
    mos.write(newIAW(1), EmptyImw, "pg/bucket-0");
    mos.write(newIAW(0),
            new IntMatrixWritable(newIAW(0, 1, 2, 4, 7, 9), newIAW(0, 1, 2, 3, 5, 6, 8)),
            "pg/bucket-0");
    mos.write(newIAW(1), new IntMatrixWritable(newIAW(1, 2, 3), newIAW(4, 5, 6)), "pg/bucket-0");
    mos.write(newIAW(3), new IntMatrixWritable(newIAW(4, 7, 9), newIAW(4, 7, 9)), "pg/bucket-0");
    mos.write(EmptyIaw, EmptyImw, "pg/bucket-0");
    mos.close();
    // The mocked context only needs to hand out a configuration and a task attempt id.
    Reducer.Context ctx = createMock(Reducer.Context.class);
    EasyMock.expect(ctx.getConfiguration()).andReturn(createConfiguration()).anyTimes();
    EasyMock.expect(ctx.getTaskAttemptID()).andReturn(new TaskAttemptID()).anyTimes();
    EasyMock.replay(ctx, mos);
    ComputeTidListReducer reducer = new ComputeTidListReducer();
    reducer.setup(ctx);
    // Swap in the mocked MultipleOutputs via reflection so the expectations above apply.
    setField(reducer, "mos", mos);
    reducer.reduce(new Text("1"), createTestInput_NItems(), ctx);
    reducer.cleanup(ctx);
    EasyMock.verify(mos);
}
From source file:be.uantwerpen.adrem.bigfim.ComputeTidListReducerTest.java
License:Apache License
@Test public void N_PG_N_Items() throws Exception {
    // Two reduce() calls with different keys are expected to produce two groups of
    // writes on two different outputs ("pg/bucket-0" and "pg/bucket-1"), each group
    // framed by an opening record and a closing empty record.
    MultipleOutputs<IntArrayWritable, IntMatrixWritable> mos = createMock(MultipleOutputs.class);
    mos.write(newIAW(1), EmptyImw, "pg/bucket-0");
    mos.write(newIAW(0),
            new IntMatrixWritable(newIAW(0, 1, 2, 4, 7, 9), newIAW(0, 1, 2, 3, 5, 6, 8)),
            "pg/bucket-0");
    mos.write(newIAW(1), new IntMatrixWritable(newIAW(1, 2, 3), newIAW(4, 5, 6)), "pg/bucket-0");
    mos.write(newIAW(3), new IntMatrixWritable(newIAW(4, 7, 9), newIAW(4, 7, 9)), "pg/bucket-0");
    mos.write(EmptyIaw, EmptyImw, "pg/bucket-0");
    mos.write(newIAW(2), EmptyImw, "pg/bucket-1");
    mos.write(newIAW(1), new IntMatrixWritable(newIAW(1, 4, 7, 8), newIAW(1, 5, 6, 8)), "pg/bucket-1");
    mos.write(newIAW(2),
            new IntMatrixWritable(newIAW(3, 5, 7), newIAW(1, 2, 3, 4, 5, 6, 7, 8, 9)),
            "pg/bucket-1");
    mos.write(EmptyIaw, EmptyImw, "pg/bucket-1");
    mos.close();
    // The mocked context only needs to hand out a configuration and a task attempt id.
    Reducer.Context ctx = createMock(Reducer.Context.class);
    EasyMock.expect(ctx.getConfiguration()).andReturn(createConfiguration()).anyTimes();
    EasyMock.expect(ctx.getTaskAttemptID()).andReturn(new TaskAttemptID()).anyTimes();
    EasyMock.replay(ctx, mos);
    ComputeTidListReducer reducer = new ComputeTidListReducer();
    reducer.setup(ctx);
    // Swap in the mocked MultipleOutputs via reflection so the expectations above apply.
    setField(reducer, "mos", mos);
    reducer.reduce(new Text("1"), createTestInput_NItems(), ctx);
    reducer.reduce(new Text("2"), createTestInput_NItems2(), ctx);
    reducer.cleanup(ctx);
    EasyMock.verify(mos);
}
From source file:com.asakusafw.lang.compiler.mapreduce.testing.InputFormatTester.java
License:Apache License
/**
 * Collects input contents: iterates every split produced by the configured input
 * format, reads each record, and hands its value to the given collector.
 *
 * @param <T> the data type
 * @param collector the target collector
 * @throws IOException if failed
 * @throws InterruptedException if interrupted
 */
@SuppressWarnings("unchecked")
public <T> void collect(Consumer<T> collector) throws IOException, InterruptedException {
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    for (InputSplit rawSplit : format.getSplits(attemptContext)) {
        // Round-trip the split through restore() before reading from it.
        InputSplit materialized = restore(rawSplit);
        try (RecordReader<?, ?> recordReader = format.createRecordReader(materialized, attemptContext)) {
            recordReader.initialize(materialized, attemptContext);
            while (recordReader.nextKeyValue()) {
                collector.accept((T) recordReader.getCurrentValue());
            }
        }
    }
}
From source file:com.ask.hive.hbase.HiveHBaseTextTableInputFormat.java
License:Apache License
/**
 * Builds an old-API record reader over an HBase table split that renders each row as text:
 * the key is the row key as a string; the value concatenates one
 * "&lt;first-char-of-qualifier&gt;_&lt;cell-value&gt;" token per cell, separated by tabs
 * (or commas between cells sharing the same qualifier).
 */
public RecordReader<Text, Text> getRecordReader(InputSplit split, JobConf jobConf, final Reporter reporter)
        throws IOException {
    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getSplit();
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    // Parallel lists populated by parseColumnMapping: families/qualifiers in both
    // string and byte[] form; index iKey marks the Hive column mapped to the row key.
    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();
    int iKey;
    try {
        iKey = parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (Exception se) {
        throw new IOException(se);
    }
    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    if (hbaseColumnFamilies.size() < readColIDs.size()) {
        throw new IOException("Cannot read more columns than the given table contains.");
    }
    boolean addAll = (readColIDs.size() == 0);
    Scan scan = new Scan();
    boolean empty = true;
    if (!addAll) {
        for (int i : readColIDs) {
            if (i == iKey) {
                continue;
            }
            // NOTE(review): this adds the whole column family even when the mapping
            // specifies a qualifier for column i — confirm this over-selection is
            // intended (the empty-scan fallback below is qualifier-aware).
            scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            empty = false;
        }
    }
    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // tables column projection.
    if (empty) {
        for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
            if (i == iKey) {
                continue;
            }
            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }
            if (!addAll) {
                break;
            }
        }
    }
    // setting start and end time for scanning
    setTime(jobConf, scan);
    // If Hive's optimizer gave us a filter to process, convert it to the
    // HBase scan form now.
    tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);
    setScan(scan);
    Job job = new Job(jobConf);
    // Adapter context that forwards progress() to the old-API reporter.
    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
        @Override
        public void progress() {
            reporter.progress();
        }
    };
    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader = createRecordReader(
            tableSplit, tac);
    // Wrap the new-API reader in the old-API RecordReader interface expected by callers.
    return new RecordReader<Text, Text>() {
        public void close() throws IOException {
            recordReader.close();
        }

        public Text createKey() {
            return new Text();
        }

        public Text createValue() {
            return new Text();
        }

        // Position is not tracked; always reports 0.
        public long getPos() throws IOException {
            return 0;
        }

        public float getProgress() throws IOException {
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        public boolean next(Text rowKey, Text value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    rowKey.set(Bytes.toString(recordReader.getCurrentValue().getRow()));
                    // Build the value string cell by cell: tokens from the same
                    // qualifier are comma-separated, different qualifiers are
                    // tab-separated; each token is "<first char of qualifier>_<value>".
                    StringBuilder val = new StringBuilder();
                    String prev = "";
                    for (KeyValue kv : recordReader.getCurrentValue().raw()) {
                        // NOTE(review): new String(byte[]) uses the platform charset —
                        // confirm qualifiers are ASCII/UTF-8 compatible.
                        String current = new String(kv.getQualifier());
                        // NOTE(review): col[0] throws on an empty qualifier — confirm
                        // qualifiers are always non-empty here.
                        char[] col = new String(current).toCharArray();
                        if (val.length() > 0) {
                            if (prev.equals(current))
                                val.append(",");
                            else
                                val.append("\t");
                        }
                        prev = current;
                        val.append(col[0]).append("_");
                        val.append(Bytes.toString(kv.getValue()));
                    }
                    value.set(val.toString());
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}
From source file:com.ask.hive.hbase.HiveHBaseTimeTableInputFormat.java
License:Apache License
/**
 * Builds an old-API record reader over an HBase table split, returning raw HBase results:
 * the key is the row key bytes, the value is the full {@link Result} copied into the
 * caller-supplied instance.
 */
public RecordReader<ImmutableBytesWritable, Result> getRecordReader(InputSplit split, JobConf jobConf,
        final Reporter reporter) throws IOException {
    HBaseSplit hbaseSplit = (HBaseSplit) split;
    TableSplit tableSplit = hbaseSplit.getSplit();
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    // Parallel lists populated by parseColumnMapping: families/qualifiers in both
    // string and byte[] form; index iKey marks the Hive column mapped to the row key.
    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();
    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
                hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }
    List<Integer> readColIDs = ColumnProjectionUtils.getReadColumnIDs(jobConf);
    if (hbaseColumnFamilies.size() < readColIDs.size()) {
        throw new IOException("Cannot read more columns than the given table contains.");
    }
    boolean addAll = (readColIDs.size() == 0);
    Scan scan = new Scan();
    boolean empty = true;
    if (!addAll) {
        // Restrict the scan to the projected columns, skipping the row-key column.
        for (int i : readColIDs) {
            if (i == iKey) {
                continue;
            }
            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }
            empty = false;
        }
    }
    // The HBase table's row key maps to a Hive table column. In the corner case when only the
    // row key column is selected in Hive, the HBase Scan will be empty i.e. no column family/
    // column qualifier will have been added to the scan. We arbitrarily add at least one column
    // to the HBase scan so that we can retrieve all of the row keys and return them as the Hive
    // tables column projection.
    if (empty) {
        for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
            if (i == iKey) {
                continue;
            }
            if (hbaseColumnQualifiers.get(i) == null) {
                scan.addFamily(hbaseColumnFamiliesBytes.get(i));
            } else {
                scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
            }
            if (!addAll) {
                break;
            }
        }
    }
    // setting start and end time for scanning
    setTime(jobConf, scan);
    // If Hive's optimizer gave us a filter to process, convert it to the
    // HBase scan form now.
    tableSplit = convertFilter(jobConf, scan, tableSplit, iKey);
    setScan(scan);
    Job job = new Job(jobConf);
    // Adapter context that forwards progress() to the old-API reporter.
    TaskAttemptContext tac = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()) {
        @Override
        public void progress() {
            reporter.progress();
        }
    };
    final org.apache.hadoop.mapreduce.RecordReader<ImmutableBytesWritable, Result> recordReader = createRecordReader(
            tableSplit, tac);
    // Wrap the new-API reader in the old-API RecordReader interface expected by callers.
    return new RecordReader<ImmutableBytesWritable, Result>() {
        public void close() throws IOException {
            recordReader.close();
        }

        public ImmutableBytesWritable createKey() {
            return new ImmutableBytesWritable();
        }

        public Result createValue() {
            return new Result();
        }

        // Position is not tracked; always reports 0.
        public long getPos() throws IOException {
            return 0;
        }

        public float getProgress() throws IOException {
            float progress = 0.0F;
            try {
                progress = recordReader.getProgress();
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return progress;
        }

        public boolean next(ImmutableBytesWritable rowKey, Result value) throws IOException {
            boolean next = false;
            try {
                next = recordReader.nextKeyValue();
                if (next) {
                    // Copy row key and the whole Result into the caller-owned objects.
                    rowKey.set(recordReader.getCurrentValue().getRow());
                    Writables.copyWritable(recordReader.getCurrentValue(), value);
                }
            } catch (InterruptedException e) {
                throw new IOException(e);
            }
            return next;
        }
    };
}
From source file:com.clojurewerkz.cascading.cassandra.hadoop.ColumnFamilyInputFormat.java
License:Apache License
/**
 * Bridges split computation from the new mapreduce API to the old mapred API:
 * delegates to {@code getSplits(TaskAttemptContext)} and repackages the result
 * as an old-API split array. The {@code numSplits} hint is ignored.
 */
public org.apache.hadoop.mapred.InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    TaskAttemptContext attemptContext = new TaskAttemptContext(jobConf, new TaskAttemptID());
    List<org.apache.hadoop.mapreduce.InputSplit> computed = this.getSplits(attemptContext);
    int count = computed.size();
    org.apache.hadoop.mapred.InputSplit[] bridged = new org.apache.hadoop.mapred.InputSplit[count];
    for (int index = 0; index < count; index++) {
        // Every new-API split produced here is a ColumnFamilySplit, which also
        // implements the old-API interface.
        bridged[index] = (ColumnFamilySplit) computed.get(index);
    }
    return bridged;
}
From source file:com.cloudera.integration.oracle.goldengate.ldv.mapreduce.lib.input.LengthDelimitedInputFormatTest.java
@Test
public void test() throws IOException, InterruptedException {
    // Minimal local-filesystem configuration with 4-byte record and field length prefixes.
    Configuration conf = new Configuration(false);
    conf.set("fs.default.name", "file:///");
    conf.setInt(Constants.RECORD_PREFIX_LENGTH, 4);
    conf.setInt(Constants.FIELD_PREFIX_LENGTH, 4);
    Path path = new Path(tempFile.getAbsoluteFile().toURI());
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LengthDelimitedInputFormat inputFormat = ReflectionUtils.newInstance(LengthDelimitedInputFormat.class,
            conf);
    // The split argument to createRecordReader is null here; the reader is
    // initialized explicitly with a FileSplit below.
    try (LengthDelimitedRecordReader reader = (LengthDelimitedRecordReader) inputFormat.createRecordReader(null,
            context)) {
        FileSplit split = new FileSplit(path, 0, tempFile.length(), null);
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            LengthDelimitedWritable writable = reader.getCurrentValue();
            Assert.assertNotNull(writable);
            // Every record in the fixture carries the same timestamp.
            Timestamp timestamp = new Timestamp(writable.getTimestamp().get());
            Assert.assertEquals("2014-12-31 23:06:06.255", timestamp.toString());
            FieldValueWritable[] writables = writable.getWritables();
            // Field i is expected to hold the first i characters of the reference string.
            // NOTE(review): assumes writables.length >= chars.length() — confirm the
            // fixture file guarantees this, otherwise this indexing can overrun.
            for (int i = 0; i < chars.length(); i++) {
                String value = chars.substring(0, i);
                FieldValueWritable fieldValueWritable = writables[i];
                Assert.assertEquals(value, fieldValueWritable.getData());
            }
        }
    }
}
From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License:Apache License
/** * Use the input splits to take samples of the input and generate sample * keys. By default reads 100,000 keys from 10 locations in the input, sorts * them and picks N-1 keys to generate N equally sized partitions. * @param job the job to sample/*from w w w. j a v a2s . c o m*/ * @param partFile where to write the output file to * @throws Throwable if something goes wrong */ public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable { long t1 = System.currentTimeMillis(); Configuration conf = job.getConfiguration(); final TeraInputFormat inFormat = new TeraInputFormat(); final TextSampler sampler = new TextSampler(); int partitions = job.getNumReduceTasks(); long sampleSize = conf.getLong(SAMPLE_SIZE, 100000); final List<InputSplit> splits = inFormat.getSplits(job); long t2 = System.currentTimeMillis(); System.out.println("Computing input splits took " + (t2 - t1) + "ms"); int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size()); System.out.println("Sampling " + samples + " splits of " + splits.size()); final long recordsPerSample = sampleSize / samples; final int sampleStep = splits.size() / samples; Thread[] samplerReader = new Thread[samples]; SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group"); // take N samples from different parts of the input for (int i = 0; i < samples; ++i) { final int idx = i; samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) { { setDaemon(true); } @Override public void run() { long records = 0; try { TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx), context); reader.initialize(splits.get(sampleStep * idx), context); while (reader.nextKeyValue()) { sampler.addKey(new Text(reader.getCurrentKey())); records += 1; if (recordsPerSample <= records) { break; } } } catch (IOException ie) { 
System.err.println( "Got an exception while reading splits " + StringUtils.stringifyException(ie)); throw new RuntimeException(ie); } catch (InterruptedException e) { } } }; samplerReader[i].start(); } FileSystem outFs = partFile.getFileSystem(conf); DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize(partFile)); for (int i = 0; i < samples; i++) { try { samplerReader[i].join(); if (threadGroup.getThrowable() != null) { throw threadGroup.getThrowable(); } } catch (InterruptedException e) { } } for (Text split : sampler.createPartitions(partitions)) { split.write(writer); } writer.close(); long t3 = System.currentTimeMillis(); System.out.println("Computing parititions took " + (t3 - t2) + "ms"); }
From source file:com.facebook.hiveio.benchmark.InputBenchmark.java
License:Apache License
/**
 * Run benchmark: reads every split of the configured Hive table through
 * HiveApiInputFormat, counts rows, and reports timing metrics to stderr.
 *
 * @param args parsed args
 * @throws Exception
 */
public void run(InputBenchmarkCmd args) throws Exception {
    HadoopNative.requireHadoopNative();
    // Overall wall-clock timer covering setup, split computation, and reading.
    Timer allTime = Metrics.newTimer(InputBenchmark.class, "all-time", MILLISECONDS, MILLISECONDS);
    TimerContext allTimerContext = allTime.time();
    // Describe which table/partitions to read and which metastore to contact.
    HiveInputDescription input = new HiveInputDescription();
    input.getTableDesc().setDatabaseName(args.tableOpts.database);
    input.getTableDesc().setTableName(args.tableOpts.table);
    input.setPartitionFilter(args.tableOpts.partitionFilter);
    input.getMetastoreDesc().setHost(args.metastoreOpts.host);
    input.getMetastoreDesc().setPort(args.metastoreOpts.port);
    HiveConf hiveConf = HiveUtils.newHiveConf(InputBenchmark.class);
    System.err.println("Initialize profile with input data");
    HiveApiInputFormat.setProfileInputDesc(hiveConf, input, DEFAULT_PROFILE_ID);
    HiveApiInputFormat defaultInputFormat = new HiveApiInputFormat();
    if (args.trackMetrics) {
        defaultInputFormat.setObserver(new MetricsObserver("default", args.recordPrintPeriod));
    }
    List<InputSplit> splits = defaultInputFormat.getSplits(new JobContext(hiveConf, new JobID()));
    System.err.println("getSplits returned " + splits.size() + " splits");
    long numRows = 0;
    for (int i = 0; i < splits.size(); ++i) {
        InputSplit split = splits.get(i);
        // Each split gets its own fresh task attempt context.
        TaskAttemptID taskID = new TaskAttemptID();
        TaskAttemptContext taskContext = new TaskAttemptContext(hiveConf, taskID);
        if (i % args.splitPrintPeriod == 0) {
            System.err.println("Handling split " + i + " of " + splits.size());
        }
        RecordReader<WritableComparable, HiveReadableRecord> reader = defaultInputFormat
                .createRecordReader(split, taskContext);
        reader.initialize(split, taskContext);
        numRows += readFully(reader);
    }
    System.err.println("Parsed " + numRows + " rows");
    allTimerContext.stop();
    // Dump all collected metrics to stderr.
    new ConsoleReporter(System.err).run();
}