List of usage examples for org.apache.hadoop.mapreduce.TaskID — no-argument constructor TaskID()
public TaskID()
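All of the examples below share the same small pattern: the no-argument TaskID() constructor builds a placeholder task id, which is wrapped in a TaskAttemptID and used to obtain a TaskAttemptContext so that a RecordReader can be driven outside of a running job. A minimal self-contained sketch of that pattern follows; the class name TaskIdExample and the direct use of TaskAttemptContextImpl are illustrative (several projects below go through a TaskAttemptContextFactory helper instead):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class TaskIdExample {
    public static void main(String[] args) {
        // Placeholder ids: a default TaskID wrapped in attempt number 1,
        // as done in every snippet on this page.
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);

        // A context built from these ids is enough to initialize a RecordReader
        // outside of a real MapReduce job (e.g. for sampling or tests).
        TaskAttemptContext attemptContext = new TaskAttemptContextImpl(new Configuration(), attemptId);
        System.out.println("Task attempt id: " + attemptContext.getTaskAttemptID());
    }
}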
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java
License:Apache License
public void testSplits(long maxSplitSize, int generatedRows) throws IOException, InterruptedException,
        IllegalArgumentException, SecurityException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    logger.info("Testing maxSplitSize: " + maxSplitSize + " and generatedRows:" + generatedRows);
    FileSystem fS = FileSystem.get(getConf());
    Random r = new Random(1);
    Schema schema = new Schema("schema", Fields.parse("i:int,s:string"));
    ITuple tuple = new Tuple(schema);

    Path outPath = new Path(OUT);
    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(getConf()), getConf(), outPath, schema);
    for (int i = 0; i < generatedRows; i++) {
        tuple.set("i", r.nextInt());
        tuple.set("s", r.nextLong() + "");
        writer.append(tuple);
    }
    writer.close();

    TupleInputFormat format = ReflectionUtils.newInstance(TupleInputFormat.class, getConf());
    Job job = new Job(getConf());
    FileInputFormat.setInputPaths(job, outPath);
    logger.info("Using max input split size: " + maxSplitSize);
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
    job.setInputFormatClass(FileInputFormat.class);

    // Read all the splits and count. The number of rows read must
    // be the same as the number written.
    int count = 0;
    for (InputSplit split : format.getSplits(job)) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
        TaskAttemptContext attemptContext = TaskAttemptContextFactory.get(getConf(), attemptId);
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = format.createRecordReader(split, attemptContext);
        reader.initialize(split, attemptContext);
        while (reader.nextKeyValue()) {
            tuple = reader.getCurrentKey();
            count++;
        }
        reader.close();
    }

    assertEquals(generatedRows, count);
    HadoopUtils.deleteIfExists(fS, outPath);
}
From source file:com.splout.db.hadoop.SchemaSampler.java
License:Apache License
public static Schema sample(Configuration conf, Path input, InputFormat<ITuple, NullWritable> inputFormat)
        throws IOException, InterruptedException {
    Schema schema = null;

    // sample schema from input path given the provided InputFormat
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    FileInputFormat.setInputPaths(job, input);

    // get first inputSplit
    List<InputSplit> inputSplits = inputFormat.getSplits(job);
    if (inputSplits == null || inputSplits.size() == 0) {
        throw new IOException(
                "Given input format doesn't produce any input split. Can't sample first record. PATH: " + input);
    }
    InputSplit inputSplit = inputSplits.get(0);

    TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
    TaskAttemptContext attemptContext;
    try {
        attemptContext = TaskAttemptContextFactory.get(conf, attemptId);
    } catch (Exception e) {
        throw new IOException(e);
    }

    RecordReader<ITuple, NullWritable> rReader = inputFormat.createRecordReader(inputSplit, attemptContext);
    rReader.initialize(inputSplit, attemptContext);

    if (!rReader.nextKeyValue()) {
        throw new IOException("Can't read first record of first input split of the given path [" + input + "].");
    }

    // finally get the sample schema
    schema = rReader.getCurrentKey().getSchema();
    log.info("Sampled schema from [" + input + "] : " + schema);
    rReader.close();
    return schema;
}
From source file:com.splout.db.hadoop.TupleSampler.java
License:Apache License
/**
 * Random sampling method a-la-TeraSort, getting some consecutive samples from each InputSplit
 * without using a Job. The output is a SequenceFile with keys.
 *
 * @return The number of retrieved samples
 */
private long randomSampling(long sampleSize, Configuration hadoopConf, Path outFile, List<InputSplit> splits,
        Map<InputSplit, TableSpec> splitToTableSpec,
        Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat,
        Map<InputSplit, Map<String, String>> specificHadoopConf,
        Map<InputSplit, RecordProcessor> recordProcessorPerSplit,
        Map<InputSplit, JavascriptEngine> splitToJsEngine, int maxSplitsToVisit) throws IOException {

    // Instantiate the writer we will write samples to
    FileSystem fs = FileSystem.get(outFile.toUri(), hadoopConf);

    if (splits.size() == 0) {
        throw new IllegalArgumentException("There are no splits to sample from!");
    }

    @SuppressWarnings("deprecation")
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, hadoopConf, outFile, Text.class,
            NullWritable.class);

    logger.info("Sequential sampling options, max splits to visit: " + maxSplitsToVisit + ", samples to take: "
            + sampleSize + ", total number of splits: " + splits.size());
    int blocks = Math.min(maxSplitsToVisit, splits.size());
    blocks = Math.min((int) sampleSize, blocks);
    long recordsPerSample = sampleSize / blocks;
    int sampleStep = splits.size() / blocks;

    long records = 0;

    CounterInterface counterInterface = new CounterInterface(null) {
        public Counter getCounter(String group, String name) {
            return Mockito.mock(Counter.class);
        }
    };

    // Take N samples from different parts of the input
    for (int i = 0; i < blocks; ++i) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);

        TaskAttemptContext attemptContext = null;
        try {
            attemptContext = TaskAttemptContextFactory.get(hadoopConf, attemptId);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        InputSplit split = splits.get(sampleStep * i);
        if (specificHadoopConf.get(split) != null) {
            for (Map.Entry<String, String> specificConf : specificHadoopConf.get(split).entrySet()) {
                attemptContext.getConfiguration().set(specificConf.getKey(), specificConf.getValue());
            }
        }
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = null;
        try {
            reader = splitToFormat.get(split).createRecordReader(split, attemptContext);
            reader.initialize(split, attemptContext);

            RecordProcessor processor = recordProcessorPerSplit.get(split);
            Text key = new Text();
            while (reader.nextKeyValue()) {
                ITuple tuple = reader.getCurrentKey();

                ITuple uTuple;
                try {
                    uTuple = processor.process(tuple, tuple.getSchema().getName(), counterInterface);
                } catch (Throwable e) {
                    throw new RuntimeException(e);
                }
                if (uTuple != null) { // user may have filtered the record
                    try {
                        key.set(TablespaceGenerator.getPartitionByKey(uTuple, splitToTableSpec.get(split),
                                splitToJsEngine.get(split)));
                    } catch (Throwable e) {
                        throw new RuntimeException("Error when determining partition key.", e);
                    }
                    writer.append(key, NullWritable.get());
                    records += 1;
                    if ((i + 1) * recordsPerSample <= records) {
                        break;
                    }
                }
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

    writer.close();
    return records;
}
From source file:org.apache.hcatalog.shims.HCatHadoopShims20S.java
License:Apache License
@Override
public TaskID createTaskID() {
    return new TaskID();
}
From source file:org.apache.rya.accumulo.mr.GraphXEdgeInputFormatTest.java
License:Apache License
@SuppressWarnings("rawtypes") @Test/*from ww w . ja va 2 s. c o m*/ public void testInputFormat() throws Exception { RyaStatement input = RyaStatement.builder().setSubject(new RyaURI("http://www.google.com")) .setPredicate(new RyaURI("http://some_other_uri")).setObject(new RyaURI("http://www.yahoo.com")) .setColumnVisibility(new byte[0]).setValue(new byte[0]).build(); apiImpl.add(input); Job jobConf = Job.getInstance(); GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName()); GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password); GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO); GraphXEdgeInputFormat.setInputTableName(jobConf, table); GraphXEdgeInputFormat.setInputTableName(jobConf, table); GraphXEdgeInputFormat.setScanIsolation(jobConf, false); GraphXEdgeInputFormat.setLocalIterators(jobConf, false); GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false); GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat(); JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID()); List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(), new TaskAttemptID(new TaskID(), 1)); RecordReader reader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext); RecordReader ryaStatementRecordReader = (RecordReader) reader; ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext); List<Edge> results = new ArrayList<Edge>(); while (ryaStatementRecordReader.nextKeyValue()) { Edge writable = (Edge) ryaStatementRecordReader.getCurrentValue(); long srcId = writable.srcId(); long destId = writable.dstId(); RyaTypeWritable rtw = null; Object text = ryaStatementRecordReader.getCurrentKey(); Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw); results.add(edge); System.out.println(text); } System.out.println(results.size()); System.out.println(results); Assert.assertTrue(results.size() == 2); }
From source file:org.apache.rya.accumulo.mr.GraphXInputFormatTest.java
License:Apache License
@Test
public void testInputFormat() throws Exception {
    RyaStatement input = RyaStatement.builder().setSubject(new RyaURI("http://www.google.com"))
            .setPredicate(new RyaURI("http://some_other_uri")).setObject(new RyaURI("http://www.yahoo.com"))
            .setColumnVisibility(new byte[0]).setValue(new byte[0]).build();

    apiImpl.add(input);

    Job jobConf = Job.getInstance();

    GraphXInputFormat.setMockInstance(jobConf, instance.getInstanceName());
    GraphXInputFormat.setConnectorInfo(jobConf, username, password);
    GraphXInputFormat.setInputTableName(jobConf, table);
    GraphXInputFormat.setInputTableName(jobConf, table);

    GraphXInputFormat.setScanIsolation(jobConf, false);
    GraphXInputFormat.setLocalIterators(jobConf, false);
    GraphXInputFormat.setOfflineTableScan(jobConf, false);

    GraphXInputFormat inputFormat = new GraphXInputFormat();

    JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID());

    List<InputSplit> splits = inputFormat.getSplits(context);

    Assert.assertEquals(1, splits.size());

    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(),
            new TaskAttemptID(new TaskID(), 1));

    RecordReader<Object, RyaTypeWritable> reader = inputFormat.createRecordReader(splits.get(0),
            taskAttemptContext);

    RyaStatementRecordReader ryaStatementRecordReader = (RyaStatementRecordReader) reader;
    ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext);

    List<RyaType> results = new ArrayList<RyaType>();
    System.out.println("before while");
    while (ryaStatementRecordReader.nextKeyValue()) {
        System.out.println("in while");
        RyaTypeWritable writable = ryaStatementRecordReader.getCurrentValue();
        RyaType value = writable.getRyaType();
        Object text = ryaStatementRecordReader.getCurrentKey();
        RyaType type = new RyaType();
        type.setData(value.getData());
        type.setDataType(value.getDataType());
        results.add(type);
        System.out.println(value.getData());
        System.out.println(value.getDataType());
        System.out.println(results);
        System.out.println(type);
        System.out.println(text);
        System.out.println(value);
    }
    System.out.println("after while");

    System.out.println(results.size());
    System.out.println(results);
    // Assert.assertTrue(results.size() == 2);
    // Assert.assertTrue(results.contains(input));
}
From source file:org.apache.rya.accumulo.mr.RyaInputFormatTest.java
License:Apache License
@Test
public void testInputFormat() throws Exception {
    RyaStatement input = RyaStatement.builder().setSubject(new RyaURI("http://www.google.com"))
            .setPredicate(new RyaURI("http://some_other_uri")).setObject(new RyaURI("http://www.yahoo.com"))
            .setColumnVisibility(new byte[0]).setValue(new byte[0]).build();

    apiImpl.add(input);

    Job jobConf = Job.getInstance();

    RyaInputFormat.setMockInstance(jobConf, instance.getInstanceName());
    RyaInputFormat.setConnectorInfo(jobConf, username, password);
    RyaInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO);

    AccumuloInputFormat.setInputTableName(jobConf, table);
    AccumuloInputFormat.setInputTableName(jobConf, table);
    AccumuloInputFormat.setScanIsolation(jobConf, false);
    AccumuloInputFormat.setLocalIterators(jobConf, false);
    AccumuloInputFormat.setOfflineTableScan(jobConf, false);

    RyaInputFormat inputFormat = new RyaInputFormat();

    JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID());

    List<InputSplit> splits = inputFormat.getSplits(context);

    Assert.assertEquals(1, splits.size());

    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(),
            new TaskAttemptID(new TaskID(), 1));

    RecordReader<Text, RyaStatementWritable> reader = inputFormat.createRecordReader(splits.get(0),
            taskAttemptContext);

    RyaStatementRecordReader ryaStatementRecordReader = (RyaStatementRecordReader) reader;
    ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext);

    List<RyaStatement> results = new ArrayList<RyaStatement>();
    while (ryaStatementRecordReader.nextKeyValue()) {
        RyaStatementWritable writable = ryaStatementRecordReader.getCurrentValue();
        RyaStatement value = writable.getRyaStatement();
        Text text = ryaStatementRecordReader.getCurrentKey();
        RyaStatement stmt = RyaStatement.builder().setSubject(value.getSubject())
                .setPredicate(value.getPredicate()).setObject(value.getObject()).setContext(value.getContext())
                .setQualifier(value.getQualifer()).setColumnVisibility(value.getColumnVisibility())
                .setValue(value.getValue()).build();
        results.add(stmt);

        System.out.println(text);
        System.out.println(value);
    }

    Assert.assertTrue(results.size() == 2);
    Assert.assertTrue(results.contains(input));
}