List of usage examples for org.apache.hadoop.mapreduce.InputFormat.createRecordReader
public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
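For orientation, here is a minimal, self-contained sketch of the usual calling sequence: create the reader for one split, initialize it, then iterate with nextKeyValue. It assumes the built-in TextInputFormat and a hypothetical local file path; it is not taken from any of the projects listed below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class CreateRecordReaderSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical input file; the split covers the whole file starting at offset 0.
        Path path = new Path("/tmp/example.txt");
        FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, new String[0]);

        TextInputFormat inputFormat = new TextInputFormat();
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        // createRecordReader builds a reader for one split; it must be initialized before use.
        RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
        reader.initialize(split, context);
        try {
            while (reader.nextKeyValue()) {
                // Key is the byte offset of the line, value is the line text.
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
        } finally {
            reader.close();
        }
    }
}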
From source file: org.apache.tinkerpop.gremlin.hadoop.structure.hdfs.HadoopElementIterator.java
License: Apache License
public HadoopElementIterator(final HadoopGraph graph) throws IOException {
    try {
        this.graph = graph;
        if (this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION)) {
            final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
            final InputFormat<NullWritable, VertexWritable> inputFormat = this.graph.configuration()
                    .getGraphInputFormat().getConstructor().newInstance();
            for (final FileStatus status : FileSystem.get(configuration).listStatus(
                    new Path(graph.configuration().getInputLocation()), HiddenFileFilter.instance())) {
                this.readers.add(inputFormat.createRecordReader(
                        new FileSplit(status.getPath(), 0, Integer.MAX_VALUE, new String[] {}),
                        new TaskAttemptContext(configuration, new TaskAttemptID())));
            }
        }
    } catch (Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
From source file: org.apache.tinkerpop.gremlin.hadoop.structure.io.HadoopElementIterator.java
License: Apache License
public HadoopElementIterator(final HadoopGraph graph) {
    try {
        this.graph = graph;
        final Configuration configuration = ConfUtil.makeHadoopConfiguration(this.graph.configuration());
        final InputFormat<NullWritable, VertexWritable> inputFormat = ConfUtil.getReaderAsInputFormat(configuration);
        if (inputFormat instanceof FileInputFormat) {
            final Storage storage = FileSystemStorage.open(configuration);
            if (!this.graph.configuration().containsKey(Constants.GREMLIN_HADOOP_INPUT_LOCATION))
                return; // there is no input location and thus, no data (empty graph)
            if (!Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).isPresent())
                return; // there is no data at the input location (empty graph)
            configuration.set(Constants.MAPREDUCE_INPUT_FILEINPUTFORMAT_INPUTDIR,
                    Constants.getSearchGraphLocation(this.graph.configuration().getInputLocation(), storage).get());
        }
        final List<InputSplit> splits = inputFormat
                .getSplits(new JobContextImpl(configuration, new JobID(UUID.randomUUID().toString(), 1)));
        for (final InputSplit split : splits) {
            this.readers.add(inputFormat.createRecordReader(split,
                    new TaskAttemptContextImpl(configuration, new TaskAttemptID())));
        }
    } catch (final Exception e) {
        throw new IllegalStateException(e.getMessage(), e);
    }
}
From source file: org.apache.tinkerpop.gremlin.hadoop.structure.io.RecordReaderWriterTest.java
License: Apache License
private static void validateFileSplits(final List<FileSplit> fileSplits, final Configuration configuration,
        final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass,
        final Optional<Class<? extends OutputFormat<NullWritable, VertexWritable>>> outFormatClass) throws Exception {

    final InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, configuration);
    final TaskAttemptContext job = new TaskAttemptContextImpl(configuration,
            new TaskAttemptID(UUID.randomUUID().toString(), 0, TaskType.MAP, 0, 0));

    int vertexCount = 0;
    int outEdgeCount = 0;
    int inEdgeCount = 0;

    final OutputFormat<NullWritable, VertexWritable> outputFormat = outFormatClass.isPresent()
            ? ReflectionUtils.newInstance(outFormatClass.get(), configuration)
            : null;
    final RecordWriter<NullWritable, VertexWritable> writer = null == outputFormat
            ? null
            : outputFormat.getRecordWriter(job);

    boolean foundKeyValue = false;
    for (final FileSplit split : fileSplits) {
        logger.info("\treading file split {}", split.getPath().getName() + " ({}",
                split.getStart() + "..." + (split.getStart() + split.getLength()), "{} {} bytes)");
        final RecordReader reader = inputFormat.createRecordReader(split, job);
        float lastProgress = -1f;
        while (reader.nextKeyValue()) {
            //System.out.println("" + reader.getProgress() + "> " + reader.getCurrentKey() + ": " + reader.getCurrentValue());
            final float progress = reader.getProgress();
            assertTrue(progress >= lastProgress);
            assertEquals(NullWritable.class, reader.getCurrentKey().getClass());
            final VertexWritable vertexWritable = (VertexWritable) reader.getCurrentValue();
            if (null != writer)
                writer.write(NullWritable.get(), vertexWritable);
            vertexCount++;
            outEdgeCount = outEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.OUT));
            inEdgeCount = inEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.IN));
            final Vertex vertex = vertexWritable.get();
            assertEquals(Integer.class, vertex.id().getClass());
            if (vertex.value("name").equals("SUGAR MAGNOLIA")) {
                foundKeyValue = true;
                assertEquals(92, IteratorUtils.count(vertex.edges(Direction.OUT)));
                assertEquals(77, IteratorUtils.count(vertex.edges(Direction.IN)));
            }
            lastProgress = progress;
        }
    }

    assertEquals(8049, outEdgeCount);
    assertEquals(8049, inEdgeCount);
    assertEquals(outEdgeCount, inEdgeCount);
    assertEquals(808, vertexCount);
    assertTrue(foundKeyValue);

    if (null != writer) {
        writer.close(new TaskAttemptContextImpl(configuration, job.getTaskAttemptID()));
        for (int i = 1; i < 10; i++) {
            final File outputDirectory = new File(
                    new URL(configuration.get("mapreduce.output.fileoutputformat.outputdir")).toURI());
            final List<FileSplit> splits = generateFileSplits(
                    new File(outputDirectory.getAbsoluteFile() + "/_temporary/0/_temporary/"
                            + job.getTaskAttemptID().getTaskID().toString().replace("task", "attempt")
                            + "_0" + "/part-m-00000"), i);
            validateFileSplits(splits, configuration, inputFormatClass, Optional.empty());
        }
    }
}
From source file: org.apache.vxquery.metadata.VXQueryCollectionOperatorDescriptor.java
License: Apache License
@Override
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions) throws HyracksDataException {
    final FrameTupleAccessor fta = new FrameTupleAccessor(ctx.getFrameSize(),
            recordDescProvider.getInputRecordDescriptor(getActivityId(), 0));
    final int fieldOutputCount = recordDescProvider.getOutputRecordDescriptor(getActivityId(), 0).getFieldCount();
    final ByteBuffer frame = ctx.allocateFrame();
    final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize(), fieldOutputCount);
    final short partitionId = (short) ctx.getTaskAttemptId().getTaskId().getPartition();
    final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider(partitionId, dataSourceId, totalDataSources);
    final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId();
    final DynamicContext dCtx = (DynamicContext) ctx.getJobletContext().getGlobalJobData();
    final String collectionName = collectionPartitions[partition % collectionPartitions.length];
    final XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId, frame, appender, childSeq,
            dCtx.getStaticContext());

    return new AbstractUnaryInputUnaryOutputOperatorNodePushable() {
        @Override
        public void open() throws HyracksDataException {
            appender.reset(frame, true);
            writer.open();
            hdfs = new HDFSFunctions();
        }

        @Override
        public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
            fta.reset(buffer);
            String collectionModifiedName = collectionName.replace("${nodeId}", nodeId);
            if (!collectionModifiedName.contains("hdfs:/")) {
                File collectionDirectory = new File(collectionModifiedName);
                // check if directory is in the local file system
                if (collectionDirectory.exists()) {
                    // Go through each tuple.
                    if (collectionDirectory.isDirectory()) {
                        for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                            Iterator<File> it = FileUtils.iterateFiles(collectionDirectory,
                                    new VXQueryIOFileFilter(), TrueFileFilter.INSTANCE);
                            while (it.hasNext()) {
                                File xmlDocument = it.next();
                                if (LOGGER.isLoggable(Level.FINE)) {
                                    LOGGER.fine("Starting to read XML document: " + xmlDocument.getAbsolutePath());
                                }
                                parser.parseElements(xmlDocument, writer, fta, tupleIndex);
                            }
                        }
                    } else {
                        throw new HyracksDataException("Invalid directory parameter (" + nodeId + ":"
                                + collectionDirectory.getAbsolutePath() + ") passed to collection.");
                    }
                }
            } else {
                // Else check in HDFS file system
                // Get instance of the HDFS filesystem
                FileSystem fs = hdfs.getFileSystem();
                if (fs != null) {
                    collectionModifiedName = collectionModifiedName.replaceAll("hdfs:/", "");
                    Path directory = new Path(collectionModifiedName);
                    Path xmlDocument;
                    if (tag != null) {
                        hdfs.setJob(directory.getName(), tag);
                        tag = "<" + tag + ">";
                        Job job = hdfs.getJob();
                        InputFormat inputFormat = hdfs.getinputFormat();
                        try {
                            hdfs.scheduleSplits();
                            ArrayList<Integer> schedule = hdfs
                                    .getScheduleForNode(InetAddress.getLocalHost().getHostName());
                            List<InputSplit> splits = hdfs.getSplits();
                            List<FileSplit> fileSplits = new ArrayList<FileSplit>();
                            for (int i : schedule) {
                                fileSplits.add((FileSplit) splits.get(i));
                            }
                            FileSplitsFactory splitsFactory = new FileSplitsFactory(fileSplits);
                            List<FileSplit> inputSplits = splitsFactory.getSplits();
                            ContextFactory ctxFactory = new ContextFactory();
                            int size = inputSplits.size();
                            InputStream stream;
                            String value;
                            RecordReader reader;
                            TaskAttemptContext context;
                            for (int i = 0; i < size; i++) {
                                // read split
                                context = ctxFactory.createContext(job.getConfiguration(), i);
                                try {
                                    reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                                    reader.initialize(inputSplits.get(i), context);
                                    while (reader.nextKeyValue()) {
                                        value = reader.getCurrentValue().toString();
                                        // Split value if it contains more than one item with the tag
                                        if (StringUtils.countMatches(value, tag) > 1) {
                                            String items[] = value.split(tag);
                                            for (String item : items) {
                                                if (item.length() > 0) {
                                                    item = START_TAG + tag + item;
                                                    stream = new ByteArrayInputStream(
                                                            item.getBytes(StandardCharsets.UTF_8));
                                                    parser.parseHDFSElements(stream, writer, fta, i);
                                                }
                                            }
                                        } else {
                                            value = START_TAG + value;
                                            // create an input stream to the file currently reading and send it to parser
                                            stream = new ByteArrayInputStream(value.getBytes(StandardCharsets.UTF_8));
                                            parser.parseHDFSElements(stream, writer, fta, i);
                                        }
                                    }
                                } catch (InterruptedException e) {
                                    if (LOGGER.isLoggable(Level.SEVERE)) {
                                        LOGGER.severe(e.getMessage());
                                    }
                                }
                            }
                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (ParserConfigurationException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (SAXException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    } else {
                        try {
                            // check if the path exists and is a directory
                            if (fs.exists(directory) && fs.isDirectory(directory)) {
                                for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                                    // read every file in the directory
                                    RemoteIterator<LocatedFileStatus> it = fs.listFiles(directory, true);
                                    while (it.hasNext()) {
                                        xmlDocument = it.next().getPath();
                                        if (fs.isFile(xmlDocument)) {
                                            if (LOGGER.isLoggable(Level.FINE)) {
                                                LOGGER.fine("Starting to read XML document: " + xmlDocument.getName());
                                            }
                                            // create an input stream to the file currently reading and send it to parser
                                            InputStream in = fs.open(xmlDocument).getWrappedStream();
                                            parser.parseHDFSElements(in, writer, fta, tupleIndex);
                                        }
                                    }
                                }
                            } else {
                                throw new HyracksDataException("Invalid HDFS directory parameter (" + nodeId + ":"
                                        + directory + ") passed to collection.");
                            }
                        } catch (FileNotFoundException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    }
                    try {
                        fs.close();
                    } catch (IOException e) {
                        if (LOGGER.isLoggable(Level.SEVERE)) {
                            LOGGER.severe(e.getMessage());
                        }
                    }
                }
            }
        }

        @Override
        public void fail() throws HyracksDataException {
            writer.fail();
        }

        @Override
        public void close() throws HyracksDataException {
            // Check if needed?
            fta.reset(frame);
            if (fta.getTupleCount() > 0) {
                FrameUtils.flushFrame(frame, writer);
            }
            writer.close();
        }
    };
}
From source file: org.goldenorb.io.input.VertexInput.java
License: Apache License
/**
 *
 */
@SuppressWarnings("unchecked")
public void initialize() {
    // rebuild the input split
    org.apache.hadoop.mapreduce.InputSplit split = null;
    DataInputBuffer splitBuffer = new DataInputBuffer();
    splitBuffer.reset(rawSplit.getBytes(), 0, rawSplit.getLength());
    SerializationFactory factory = new SerializationFactory(orbConf);
    Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit> deserializer;
    try {
        deserializer = (Deserializer<? extends org.apache.hadoop.mapreduce.InputSplit>) factory
                .getDeserializer(orbConf.getClassByName(splitClass));
        deserializer.open(splitBuffer);
        split = deserializer.deserialize(null);
        JobConf job = new JobConf(orbConf);
        JobContext jobContext = new JobContext(job, new JobID(getOrbConf().getJobNumber(), 0));
        InputFormat<INPUT_KEY, INPUT_VALUE> inputFormat;
        inputFormat = (InputFormat<INPUT_KEY, INPUT_VALUE>) ReflectionUtils
                .newInstance(jobContext.getInputFormatClass(), orbConf);
        TaskAttemptContext tao = new TaskAttemptContext(job,
                new TaskAttemptID(new TaskID(jobContext.getJobID(), true, partitionID), 0));
        recordReader = inputFormat.createRecordReader(split, tao);
        recordReader.initialize(split, tao);
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
    } catch (IOException e) {
        throw new RuntimeException(e);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }
}
From source file: org.gridgain.grid.kernal.processors.hadoop.v2.GridHadoopV2MapTask.java
License: Open Source License
/** {@inheritDoc} */
@SuppressWarnings({ "ConstantConditions", "unchecked" })
@Override
public void run0(GridHadoopV2TaskContext taskCtx) throws GridException {
    GridHadoopInputSplit split = info().inputSplit();

    InputSplit nativeSplit;
    if (split instanceof GridHadoopFileBlock) {
        GridHadoopFileBlock block = (GridHadoopFileBlock) split;
        nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(), null);
    } else
        nativeSplit = (InputSplit) taskCtx.getNativeSplit(split);

    assert nativeSplit != null;

    OutputFormat outputFormat = null;
    Exception err = null;
    JobContextImpl jobCtx = taskCtx.jobContext();

    try {
        InputFormat inFormat = ReflectionUtils.newInstance(jobCtx.getInputFormatClass(),
                hadoopContext().getConfiguration());

        RecordReader reader = inFormat.createRecordReader(nativeSplit, hadoopContext());
        reader.initialize(nativeSplit, hadoopContext());
        hadoopContext().reader(reader);

        GridHadoopJobInfo jobInfo = taskCtx.job().info();
        outputFormat = jobInfo.hasCombiner() || jobInfo.hasReducer() ? null : prepareWriter(jobCtx);

        Mapper mapper = ReflectionUtils.newInstance(jobCtx.getMapperClass(), hadoopContext().getConfiguration());
        try {
            mapper.run(new WrappedMapper().getMapContext(hadoopContext()));
        } finally {
            closeWriter();
        }

        commit(outputFormat);
    } catch (InterruptedException e) {
        err = e;
        Thread.currentThread().interrupt();
        throw new GridInterruptedException(e);
    } catch (Exception e) {
        err = e;
        throw new GridException(e);
    } finally {
        if (err != null)
            abort(outputFormat);
    }
}
From source file: org.mrgeo.format.FilteredFeatureInputFormat.java
License: Apache License
@Override
public RecordReader<LongWritable, Geometry> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    try {
        InputFormat<LongWritable, Geometry> srcFormat = getInputFormat(context.getConfiguration()).newInstance();
        RecordReader<LongWritable, Geometry> srcReader = srcFormat.createRecordReader(split, context);
        return new FilteredRecordReader(srcReader, getFeatureFilter(context.getConfiguration()));
    } catch (Exception e) {
        throw new IOException(e);
    }
}
From source file: org.tensorflow.hadoop.io.TFRecordFileTest.java
License: Open Source License
@Test
public void testInputOutputFormat() throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "tfr-test");
    TFRecordFileOutputFormat.setOutputPath(job, outdir);

    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    OutputFormat<BytesWritable, NullWritable> outputFormat = new TFRecordFileOutputFormat();
    OutputCommitter committer = outputFormat.getOutputCommitter(context);
    committer.setupJob(job);
    RecordWriter<BytesWritable, NullWritable> writer = outputFormat.getRecordWriter(context);

    // Write Example with random numbers
    Random rand = new Random();
    Map<Long, Long> records = new TreeMap<Long, Long>();
    try {
        for (int i = 0; i < RECORDS; ++i) {
            long randValue = rand.nextLong();
            records.put((long) i, randValue);
            Int64List data = Int64List.newBuilder().addValue(i).addValue(randValue).build();
            Feature feature = Feature.newBuilder().setInt64List(data).build();
            Features features = Features.newBuilder().putFeature("data", feature).build();
            Example example = Example.newBuilder().setFeatures(features).build();
            BytesWritable key = new BytesWritable(example.toByteArray());
            writer.write(key, NullWritable.get());
        }
    } finally {
        writer.close(context);
    }
    committer.commitTask(context);
    committer.commitJob(job);

    // Read and compare
    TFRecordFileInputFormat.setInputPaths(job, outdir);
    InputFormat<BytesWritable, NullWritable> inputFormat = new TFRecordFileInputFormat();
    for (InputSplit split : inputFormat.getSplits(job)) {
        RecordReader<BytesWritable, NullWritable> reader = inputFormat.createRecordReader(split, context);
        MapContext<BytesWritable, NullWritable, BytesWritable, NullWritable> mcontext =
                new MapContextImpl<BytesWritable, NullWritable, BytesWritable, NullWritable>(
                        job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
                        MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
            while (reader.nextKeyValue()) {
                BytesWritable bytes = reader.getCurrentKey();
                Example example = Example.parseFrom(bytes.getBytes());
                Int64List data = example.getFeatures().getFeatureMap().get("data").getInt64List();
                Long key = data.getValue(0);
                Long value = data.getValue(1);
                assertEquals(records.get(key), value);
                records.remove(key);
            }
        } finally {
            reader.close();
        }
    }
    assertEquals(0, records.size());
}
From source file: org.warcbase.io.GenericArchiveRecordWritableTest.java
License: Apache License
@Test
public void testArcInputFormat() throws Exception {
    String arcFile = Resources.getResource("arc/example.arc.gz").getPath();

    Configuration conf = new Configuration(false);
    conf.set("fs.defaultFS", "file:///");

    File testFile = new File(arcFile);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
            .newInstance(WacGenericInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split, context);
    reader.initialize(split, context);

    int cnt = 0;
    while (reader.nextKeyValue()) {
        GenericArchiveRecordWritable record = reader.getCurrentValue();
        cnt++;

        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(bytesOut);
        record.write(dataOut);

        GenericArchiveRecordWritable reconstructed = new GenericArchiveRecordWritable();
        reconstructed.setFormat(ArchiveFormat.ARC);
        reconstructed.readFields(new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray())));

        boolean isArc = (record.getFormat() == ArchiveFormat.ARC);
        assertEquals(isArc, true);
        if (isArc) {
            assertEquals(((ARCRecord) record.getRecord()).getMetaData().getUrl(),
                    ((ARCRecord) reconstructed.getRecord()).getMetaData().getUrl());
        }
    }
    assertEquals(300, cnt);
}
From source file: org.warcbase.io.GenericArchiveRecordWritableTest.java
License: Apache License
@Test
public void testWarcInputFormat() throws Exception {
    String warcFile = Resources.getResource("warc/example.warc.gz").getPath();

    Configuration conf = new Configuration(false);
    conf.set("fs.defaultFS", "file:///");

    File testFile = new File(warcFile);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat<LongWritable, GenericArchiveRecordWritable> inputFormat = ReflectionUtils
            .newInstance(WacGenericInputFormat.class, conf);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    RecordReader<LongWritable, GenericArchiveRecordWritable> reader = inputFormat.createRecordReader(split, context);
    reader.initialize(split, context);

    int cnt = 0;
    while (reader.nextKeyValue()) {
        GenericArchiveRecordWritable record = reader.getCurrentValue();
        cnt++;

        ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
        DataOutputStream dataOut = new DataOutputStream(bytesOut);
        record.write(dataOut);

        GenericArchiveRecordWritable reconstructed = new GenericArchiveRecordWritable();
        reconstructed.setFormat(ArchiveFormat.WARC);
        reconstructed.readFields(new DataInputStream(new ByteArrayInputStream(bytesOut.toByteArray())));

        boolean isWarc = (record.getFormat() == ArchiveFormat.WARC);
        assertTrue(isWarc);
        if (isWarc) {
            assertEquals(record.getRecord().getHeader().getUrl(),
                    reconstructed.getRecord().getHeader().getUrl());
            assertEquals(record.getRecord().getHeader().getContentLength(),
                    reconstructed.getRecord().getHeader().getContentLength());
        }
    }
    assertEquals(822, cnt);
}