List of usage examples for org.apache.hadoop.mapreduce.RecordReader.initialize
public abstract void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
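All of the examples below follow the same caller-side pattern: obtain a RecordReader from an InputFormat, call initialize with the same split and TaskAttemptContext, then iterate with nextKeyValue. The following is a minimal sketch of that pattern, assuming the Hadoop 2.x "new" (mapreduce) API with TextInputFormat; the class name and input path are placeholders, not taken from any of the sources below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class RecordReaderInitializeSketch {

    public static void main(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Hypothetical input path; replace with real data.
        FileInputFormat.addInputPath(job, new Path("/tmp/input.txt"));

        TextInputFormat inputFormat = new TextInputFormat();
        TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        for (InputSplit split : inputFormat.getSplits(job)) {
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            // initialize() must be called once, with the same split and context,
            // before the first call to nextKeyValue().
            reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                LongWritable key = reader.getCurrentKey();
                Text value = reader.getCurrentValue();
                // process key/value ...
            }
            reader.close();
        }
    }
}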
From source file: fi.tkk.ics.hadoop.bam.VCFInputFormat.java
License: Open Source License
/**
 * Returns a {@link BCFRecordReader} or {@link VCFRecordReader} as
 * appropriate, initialized with the given parameters.
 *
 * <p>Throws {@link IllegalArgumentException} if the given input split is
 * not a {@link FileVirtualSplit} or a {@link FileSplit}, or if the path
 * referred to is not recognized as a VCF or BCF file (see
 * {@link #getFormat}).</p>
 */
@Override
public RecordReader<LongWritable, VariantContextWritable> createRecordReader(InputSplit split,
        TaskAttemptContext ctx) throws InterruptedException, IOException {
    final Path path;
    if (split instanceof FileSplit)
        path = ((FileSplit) split).getPath();
    else if (split instanceof FileVirtualSplit)
        path = ((FileVirtualSplit) split).getPath();
    else
        throw new IllegalArgumentException("split '" + split + "' has unknown type: cannot extract path");

    if (this.conf == null)
        this.conf = ContextUtil.getConfiguration(ctx);

    final VCFFormat fmt = getFormat(path);
    if (fmt == null)
        throw new IllegalArgumentException("unknown VCF format, cannot create RecordReader: " + path);

    final RecordReader<LongWritable, VariantContextWritable> rr;
    switch (fmt) {
    case VCF:
        rr = new VCFRecordReader();
        break;
    case BCF:
        rr = new BCFRecordReader();
        break;
    default:
        assert false;
        return null;
    }

    rr.initialize(split, ctx);
    return rr;
}
From source file: gobblin.source.extractor.hadoop.HadoopFileInputSource.java
License: Apache License
@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
    if (!workUnitState.contains(FILE_SPLIT_BYTES_STRING_KEY)) {
        throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
    }

    Configuration configuration = new Configuration();
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, configuration);

    String fileSplitBytesStr = workUnitState.getProp(FILE_SPLIT_BYTES_STRING_KEY);
    FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(configuration,
            DummyTaskAttemptIDFactory.newTaskAttemptID());
    try {
        RecordReader<K, V> recordReader = fileInputFormat.createRecordReader(fileSplit, taskAttemptContext);
        recordReader.initialize(fileSplit, taskAttemptContext);
        boolean readKeys = workUnitState.getPropAsBoolean(FILE_INPUT_READ_KEYS_KEY, DEFAULT_FILE_INPUT_READ_KEYS);
        return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
    } catch (InterruptedException ie) {
        throw new IOException(ie);
    }
}
From source file: info.halo9pan.word2vec.hadoop.mr.SortInputFormat.java
License: Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job
 *          the job to sample
 * @param partFile
 *          where to write the output file to
 * @throws Throwable
 *          if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final SortInputFormat inFormat = new SortInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");

    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }

    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file: io.druid.data.input.orc.DruidOrcInputFormatTest.java
License: Apache License
@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    OrcHadoopInputRowParser parser = (OrcHadoopInputRowParser) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parse(data);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(new DateTime(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}
From source file: io.druid.data.input.parquet.DruidParquetInputFormatTest.java
License: Apache License
@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig
            .fromFile(new File("example/wikipedia_hadoop_parquet_job.json"));
    config.intoConfiguration(job);

    File testFile = new File("example/wikipedia_list.parquet");
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(DruidParquetInputFormat.class,
            job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);

    reader.initialize(split, context);
    reader.nextKeyValue();

    GenericRecord data = (GenericRecord) reader.getCurrentValue();

    // field not read, should return null
    assertEquals(data.get("added"), null);
    assertEquals(data.get("page"), new Utf8("Gypsy Danger"));

    reader.close();
}
From source file: io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java
License: Open Source License
public static void main(String[] args) throws IOException, InterruptedException {

    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);
    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat = new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat = new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(
            ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter = outputFormat.getRecordWriter(conf, output,
            CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();

    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {

        System.out.println("Processing split: " + split.getPath().toString() + "(" + splitIndex + " of "
                + splits.size() + ")");

        TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex),
                splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader = inputFormat.createRecordReader(split,
                ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {

            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();

            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());

            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());

            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }

        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

    if (recordWriter != null) {
        recordWriter.close(ctx);
    }
}
From source file: it.crs4.pydoop.mapreduce.pipes.PipesMapper.java
License: Apache License
@Override
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    Configuration conf = context.getConfiguration();
    InputSplit split = context.getInputSplit();
    // FIXME: do we really need to be so convoluted?
    InputFormat<K1, V1> inputFormat;
    try {
        inputFormat = (InputFormat<K1, V1>) ReflectionUtils.newInstance(context.getInputFormatClass(), conf);
    } catch (ClassNotFoundException ce) {
        throw new RuntimeException("class not found", ce);
    }
    RecordReader<K1, V1> input = inputFormat.createRecordReader(split, context);
    input.initialize(split, context);

    boolean isJavaInput = Submitter.getIsJavaRecordReader(conf);
    try {
        // FIXME: what happens for a java mapper and no java record reader?
        DummyRecordReader fakeInput = (!isJavaInput && !Submitter.getIsJavaMapper(conf))
                ? (DummyRecordReader) input
                : null;
        application = new Application<K1, V1, K2, V2>(context, fakeInput);
    } catch (InterruptedException ie) {
        throw new RuntimeException("interrupted", ie);
    }
    DownwardProtocol<K1, V1> downlink = application.getDownlink();

    // FIXME: InputSplit is not Writable, but still, this is ugly...
    downlink.runMap((FileSplit) context.getInputSplit(), context.getNumReduceTasks(), isJavaInput);
    boolean skipping = conf.getBoolean(context.SKIP_RECORDS, false);
    boolean sent_input_types = false;
    try {
        if (isJavaInput) {
            // FIXME
            while (input.nextKeyValue()) {
                if (!sent_input_types) {
                    sent_input_types = true;
                    NullWritable n = NullWritable.get();
                    String kclass_name = n.getClass().getName();
                    String vclass_name = n.getClass().getName();
                    if (input.getCurrentKey() != null) {
                        kclass_name = input.getCurrentKey().getClass().getName();
                    }
                    if (input.getCurrentValue() != null) {
                        vclass_name = input.getCurrentValue().getClass().getName();
                    }
                    downlink.setInputTypes(kclass_name, vclass_name);
                }
                downlink.mapItem(input.getCurrentKey(), input.getCurrentValue());
                if (skipping) {
                    // flush the streams on every record input if running in skip mode
                    // so that we don't buffer other records surrounding a bad record.
                    downlink.flush();
                }
            }
            downlink.endOfInput();
        }
        application.waitForFinish();
    } catch (Throwable t) {
        application.abort(t);
    } finally {
        cleanup(context);
    }
}
From source file: it.crs4.seal.tsv_sort.TextSampler.java
License: Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param inFormat the input to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);

    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;

    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
From source file: mvm.rya.accumulo.pig.AccumuloStorageTest.java
License: Apache License
protected List<AccumuloStorage> createAccumuloStorages(String location)
        throws IOException, InterruptedException {
    List<AccumuloStorage> accumuloStorages = new ArrayList<AccumuloStorage>();
    AccumuloStorage storage = new AccumuloStorage();
    InputFormat inputFormat = storage.getInputFormat();
    Job job = new Job(new Configuration());
    storage.setLocation(location, job);
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertNotNull(splits);

    for (InputSplit inputSplit : splits) {
        storage = new AccumuloStorage();
        job = new Job(new Configuration());
        storage.setLocation(location, job);
        TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
                new TaskAttemptID("jtid", 0, false, 0, 0));
        RecordReader recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
        recordReader.initialize(inputSplit, taskAttemptContext);

        storage.prepareToRead(recordReader, null);
        accumuloStorages.add(storage);
    }
    return accumuloStorages;
}
From source file: mvm.rya.accumulo.pig.StatementPatternStorageTest.java
License: Apache License
protected List<StatementPatternStorage> createStorages(String location)
        throws IOException, InterruptedException {
    List<StatementPatternStorage> storages = new ArrayList<StatementPatternStorage>();
    StatementPatternStorage storage = new StatementPatternStorage();
    InputFormat inputFormat = storage.getInputFormat();
    Job job = new Job(new Configuration());
    storage.setLocation(location, job);
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertNotNull(splits);

    for (InputSplit inputSplit : splits) {
        storage = new StatementPatternStorage();
        job = new Job(new Configuration());
        storage.setLocation(location, job);
        TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
                new TaskAttemptID("jtid", 0, false, 0, 0));
        RecordReader recordReader = inputFormat.createRecordReader(inputSplit, taskAttemptContext);
        recordReader.initialize(inputSplit, taskAttemptContext);

        storage.prepareToRead(recordReader, null);
        storages.add(storage);
    }
    return storages;
}