List of usage examples for org.apache.hadoop.mapreduce.RecordReader.initialize
public abstract void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
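All of the examples below follow the same lifecycle: obtain splits from an InputFormat, create a reader for each split, call initialize(split, context), then iterate with nextKeyValue()/getCurrentKey()/getCurrentValue(). The following is a minimal sketch of that pattern, not taken from any of the source files below; the input path, key/value types, and the use of TaskAttemptContextImpl (the newer Hadoop API) are illustrative assumptions.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class RecordReaderInitializeSketch {

    public static void main(String[] args) throws IOException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.setInputPaths(job, new Path("/tmp/input")); // hypothetical input path

        TextInputFormat inputFormat = new TextInputFormat();
        TaskAttemptContext context =
                new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        for (InputSplit split : inputFormat.getSplits(job)) {
            // createRecordReader() does not read anything yet; initialize() binds the
            // reader to this split and context before the first nextKeyValue() call
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            reader.initialize(split, context);
            try {
                while (reader.nextKeyValue()) {
                    LongWritable key = reader.getCurrentKey(); // byte offset of the line
                    Text value = reader.getCurrentValue();     // the line itself
                    System.out.println(key + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}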
From source file: org.apache.mahout.classifier.df.mapreduce.partial.PartialSequentialBuilder.java
License: Apache License

@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    /* first instance id in hadoop's order */
    //int[] firstIds = new int[nbSplits];
    /* partitions' sizes in hadoop order */
    int[] sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        //firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}
From source file: org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License: Apache License

@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    firstIds = new int[nbSplits];
    sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (int p = 0; p < nbSplits; p++) {
        InputSplit split = splits.get(p);
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}
From source file: org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License: Apache License

public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    Step0Context context = new Step0Context(new Step0Mapper(), job.getConfiguration(), new TaskAttemptID(), NUM_MAPS);

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();

            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, reader.getCurrentValue(), context);

            size++;
        }

        mapper.cleanup(context);

        // validate the mapper's output
        assertEquals(p, context.keys[p]);
        assertEquals(firstKey.longValue(), context.values[p].getFirstId());
        assertEquals(size, context.values[p].getSize());
    }
}
From source file: org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License: Apache License

public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < NUM_INSTANCES; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[NUM_MAPS];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            Text value = reader.getCurrentValue();

            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).getLabel();
            }

            size++;
        }

        keys.add(p);
        values.add(new Step0Output(firstKey, size));
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file: org.apache.parquet.hadoop.thrift.TestParquetToThriftReadWriteAndProjection.java
License: Apache License

private <T extends TBase<?, ?>> void shouldDoProjection(Configuration conf, T recordToWrite,
        T exptectedReadResult, Class<? extends TBase<?, ?>> thriftClass) throws Exception {
    final Path parquetFile = new Path("target/test/TestParquetToThriftReadWriteAndProjection/file.parquet");
    final FileSystem fs = parquetFile.getFileSystem(conf);
    if (fs.exists(parquetFile)) {
        fs.delete(parquetFile, true);
    }

    // create a test file
    final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
            ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));

    recordToWrite.write(protocol);
    w.write(new BytesWritable(baos.toByteArray()));
    w.close();

    final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
    final Job job = new Job(conf, "read");
    job.setInputFormatClass(ParquetThriftInputFormat.class);
    ParquetThriftInputFormat.setInputPaths(job, parquetFile);
    final JobID jobID = new JobID("local", 1);
    List<InputSplit> splits = parquetThriftInputFormat
            .getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
    T readValue = null;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
        final RecordReader<Void, T> reader = parquetThriftInputFormat.createRecordReader(split,
                taskAttemptContext);
        reader.initialize(split, taskAttemptContext);
        if (reader.nextKeyValue()) {
            readValue = reader.getCurrentValue();
            LOG.info(readValue);
        }
    }
    assertEquals(exptectedReadResult, readValue);
}
From source file: org.apache.parquet.pig.PerfTest2.java
License: Apache License

static void load(String out, int colsToLoad, StringBuilder results) throws Exception {
    StringBuilder schemaString = new StringBuilder("a0: chararray");
    for (int i = 1; i < colsToLoad; i++) {
        schemaString.append(", a" + i + ": chararray");
    }

    long t0 = System.currentTimeMillis();
    Job job = new Job(conf);
    int loadjobId = jobid++;
    LoadFunc loadFunc = new ParquetLoader(schemaString.toString());
    loadFunc.setUDFContextSignature("sigLoader" + loadjobId);
    String absPath = loadFunc.relativeToAbsolutePath(out, new Path(new File(".").getAbsoluteFile().toURI()));
    loadFunc.setLocation(absPath, job);
    @SuppressWarnings("unchecked") // that's how the base class is defined
    InputFormat<Void, Tuple> inputFormat = loadFunc.getInputFormat();
    JobContext jobContext = ContextUtil.newJobContext(ContextUtil.getConfiguration(job),
            new JobID("jt", loadjobId));
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    int i = 0;
    int taskid = 0;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID("jt", loadjobId, true, taskid++, 0));
        RecordReader<Void, Tuple> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        loadFunc.prepareToRead(recordReader, null);
        recordReader.initialize(split, taskAttemptContext);
        Tuple t;
        while ((t = loadFunc.getNext()) != null) {
            if (Log.DEBUG)
                System.out.println(t);
            ++i;
        }
    }
    assertEquals(ROW_COUNT, i);
    long t1 = System.currentTimeMillis();
    results.append((t1 - t0) + " ms to read " + colsToLoad + " columns\n");
}
From source file: org.apache.pig.builtin.AvroStorage.java
License: Apache License

/**
 * @see org.apache.pig.LoadFunc#getInputFormat()
 */
@Override
public InputFormat<NullWritable, GenericData.Record> getInputFormat() throws IOException {
    return new org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigFileInputFormat<NullWritable, GenericData.Record>() {
        @Override
        public RecordReader<NullWritable, GenericData.Record> createRecordReader(final InputSplit is,
                final TaskAttemptContext tc) throws IOException, InterruptedException {
            Schema s = getInputAvroSchema();
            RecordReader<NullWritable, GenericData.Record> rr = null;
            if (s.getType() == Type.ARRAY) {
                rr = new AvroArrayReader(s);
            } else {
                rr = new AvroRecordReader(s);
            }
            rr.initialize(is, tc);
            tc.setStatus(is.toString());
            return rr;
        }
    };
}
From source file: org.apache.rya.accumulo.mr.GraphXEdgeInputFormatTest.java
License: Apache License

@SuppressWarnings("rawtypes")
@Test
public void testInputFormat() throws Exception {
    RyaStatement input = RyaStatement.builder().setSubject(new RyaURI("http://www.google.com"))
            .setPredicate(new RyaURI("http://some_other_uri")).setObject(new RyaURI("http://www.yahoo.com"))
            .setColumnVisibility(new byte[0]).setValue(new byte[0]).build();

    apiImpl.add(input);

    Job jobConf = Job.getInstance();

    GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName());
    GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password);
    GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);

    GraphXEdgeInputFormat.setScanIsolation(jobConf, false);
    GraphXEdgeInputFormat.setLocalIterators(jobConf, false);
    GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false);

    GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat();

    JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID());

    List<InputSplit> splits = inputFormat.getSplits(context);
    Assert.assertEquals(1, splits.size());

    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(),
            new TaskAttemptID(new TaskID(), 1));

    RecordReader reader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext);

    RecordReader ryaStatementRecordReader = (RecordReader) reader;
    ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext);

    List<Edge> results = new ArrayList<Edge>();
    while (ryaStatementRecordReader.nextKeyValue()) {
        Edge writable = (Edge) ryaStatementRecordReader.getCurrentValue();
        long srcId = writable.srcId();
        long destId = writable.dstId();
        RyaTypeWritable rtw = null;
        Object text = ryaStatementRecordReader.getCurrentKey();
        Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw);
        results.add(edge);

        System.out.println(text);
    }

    System.out.println(results.size());
    System.out.println(results);
    Assert.assertTrue(results.size() == 2);
}
From source file: org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat.java
License: Apache License

@Override
public RecordReader<NullWritable, VertexWritable> createRecordReader(final InputSplit split,
        final TaskAttemptContext context) throws IOException, InterruptedException {
    RecordReader<NullWritable, VertexWritable> reader = new GraphSONRecordReader();
    reader.initialize(split, context);
    return reader;
}
From source file: org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONLegacyInputFormat.java
License: Apache License

@Override
public RecordReader<NullWritable, VertexWritable> createRecordReader(final InputSplit split,
        final TaskAttemptContext context) throws IOException, InterruptedException {
    RecordReader<NullWritable, VertexWritable> reader = new GraphSONLegacyRecordReader();
    reader.initialize(split, context);
    return reader;
}