Example usage for org.apache.hadoop.mapreduce RecordReader initialize

List of usage examples for org.apache.hadoop.mapreduce RecordReader initialize

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce RecordReader initialize.

Prototype

public abstract void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Document

Called once at initialization.
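
For orientation before the project examples, here is a minimal sketch of a custom RecordReader whose initialize method opens the split it is handed. The class name SimpleLineRecordReader and its line-reading logic are illustrative assumptions rather than code taken from Hadoop or from any project listed below, and for brevity it does not handle records that cross split boundaries. Every example that follows applies the same pattern: an InputFormat (or test harness) creates a reader and calls reader.initialize(split, context) exactly once before the first nextKeyValue() call.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

// Illustrative sketch only: a simplified line-oriented RecordReader.
// The framework calls initialize(split, context) once before the first
// call to nextKeyValue().
public class SimpleLineRecordReader extends RecordReader<LongWritable, Text> {

    private LineReader in;
    private long start;
    private long pos;
    private long end;
    private final LongWritable key = new LongWritable();
    private final Text value = new Text();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Cast to the concrete split type produced by FileInputFormat.
        FileSplit fileSplit = (FileSplit) split;
        Configuration conf = context.getConfiguration();

        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        pos = start;

        // Open the underlying file and seek to the beginning of this split.
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);
        FSDataInputStream stream = fs.open(file);
        stream.seek(start);
        in = new LineReader(stream, conf);
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        // Simplification: does not skip a partial first line or read past
        // 'end' to finish a record that straddles the split boundary.
        if (pos >= end) {
            return false;
        }
        key.set(pos);
        int bytesRead = in.readLine(value);
        if (bytesRead == 0) {
            return false;
        }
        pos += bytesRead;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public Text getCurrentValue() {
        return value;
    }

    @Override
    public float getProgress() {
        return end == start ? 0.0f : Math.min(1.0f, (pos - start) / (float) (end - start));
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}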

Usage

From source file:org.apache.tinkerpop.gremlin.hadoop.structure.io.gryo.GryoInputFormat.java

License:Apache License

@Override
public RecordReader<NullWritable, VertexWritable> createRecordReader(final InputSplit split,
        final TaskAttemptContext context) throws IOException, InterruptedException {
    final RecordReader<NullWritable, VertexWritable> reader = new GryoRecordReader();
    reader.initialize(split, context);
    return reader;
}

From source file:org.apache.tinkerpop.gremlin.hadoop.structure.io.script.ScriptInputFormat.java

License:Apache License

@Override
public RecordReader<NullWritable, VertexWritable> createRecordReader(final InputSplit split,
        final TaskAttemptContext context) throws IOException, InterruptedException {
    RecordReader<NullWritable, VertexWritable> reader = new ScriptRecordReader();
    reader.initialize(split, context);
    return reader;
}

From source file:org.apache.vxquery.hdfs2.HDFSFunctions.java

License:Apache License

public RecordReader getReader() {

    List<FileSplit> fileSplits = new ArrayList<FileSplit>();
    for (int i = 0; i < splits.size(); i++) {
        fileSplits.add((FileSplit) splits.get(i));
    }
    FileSplitsFactory splitsFactory;
    try {
        splitsFactory = new FileSplitsFactory(fileSplits);
        List<FileSplit> inputSplits = splitsFactory.getSplits();
        ContextFactory ctxFactory = new ContextFactory();
        int size = inputSplits.size();
        for (int i = 0; i < size; i++) {
            /**
             * read the split
             */
            TaskAttemptContext context;
            try {
                context = ctxFactory.createContext(job.getConfiguration(), i);
                RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                reader.initialize(inputSplits.get(i), context);
                return reader;
            } catch (HyracksDataException e) {
                if (LOGGER.isLoggable(Level.SEVERE)) {
                    LOGGER.severe(e.getMessage());
                }
            } catch (IOException e) {
                if (LOGGER.isLoggable(Level.SEVERE)) {
                    LOGGER.severe(e.getMessage());
                }
            } catch (InterruptedException e) {
                if (LOGGER.isLoggable(Level.SEVERE)) {
                    LOGGER.severe(e.getMessage());
                }
            }
        }
    } catch (HyracksDataException e) {
        if (LOGGER.isLoggable(Level.SEVERE)) {
            LOGGER.severe(e.getMessage());
        }
    }
    return null;
}

From source file:org.apache.vxquery.metadata.VXQueryCollectionOperatorDescriptor.java

License:Apache License

@Override
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions)
        throws HyracksDataException {
    final FrameTupleAccessor fta = new FrameTupleAccessor(ctx.getFrameSize(),
            recordDescProvider.getInputRecordDescriptor(getActivityId(), 0));
    final int fieldOutputCount = recordDescProvider.getOutputRecordDescriptor(getActivityId(), 0)
            .getFieldCount();
    final ByteBuffer frame = ctx.allocateFrame();
    final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize(), fieldOutputCount);
    final short partitionId = (short) ctx.getTaskAttemptId().getTaskId().getPartition();
    final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider(partitionId, dataSourceId,
            totalDataSources);
    final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId();
    final DynamicContext dCtx = (DynamicContext) ctx.getJobletContext().getGlobalJobData();

    final String collectionName = collectionPartitions[partition % collectionPartitions.length];
    final XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId, frame, appender, childSeq,
            dCtx.getStaticContext());

    return new AbstractUnaryInputUnaryOutputOperatorNodePushable() {
        @Override
        public void open() throws HyracksDataException {
            appender.reset(frame, true);
            writer.open();
            hdfs = new HDFSFunctions();
        }

        @Override
        public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
            fta.reset(buffer);
            String collectionModifiedName = collectionName.replace("${nodeId}", nodeId);
            if (!collectionModifiedName.contains("hdfs:/")) {
                File collectionDirectory = new File(collectionModifiedName);
                //check if directory is in the local file system
                if (collectionDirectory.exists()) {
                    // Go through each tuple.
                    if (collectionDirectory.isDirectory()) {
                        for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                            Iterator<File> it = FileUtils.iterateFiles(collectionDirectory,
                                    new VXQueryIOFileFilter(), TrueFileFilter.INSTANCE);
                            while (it.hasNext()) {
                                File xmlDocument = it.next();
                                if (LOGGER.isLoggable(Level.FINE)) {
                                    LOGGER.fine(
                                            "Starting to read XML document: " + xmlDocument.getAbsolutePath());
                                }
                                parser.parseElements(xmlDocument, writer, fta, tupleIndex);
                            }
                        }
                    } else {
                        throw new HyracksDataException("Invalid directory parameter (" + nodeId + ":"
                                + collectionDirectory.getAbsolutePath() + ") passed to collection.");
                    }
                }
            } else {
                // Else check in HDFS file system
                // Get instance of the HDFS filesystem
                FileSystem fs = hdfs.getFileSystem();
                if (fs != null) {
                    collectionModifiedName = collectionModifiedName.replaceAll("hdfs:/", "");
                    Path directory = new Path(collectionModifiedName);
                    Path xmlDocument;
                    if (tag != null) {
                        hdfs.setJob(directory.getName(), tag);
                        tag = "<" + tag + ">";
                        Job job = hdfs.getJob();
                        InputFormat inputFormat = hdfs.getinputFormat();
                        try {
                            hdfs.scheduleSplits();
                            ArrayList<Integer> schedule = hdfs
                                    .getScheduleForNode(InetAddress.getLocalHost().getHostName());
                            List<InputSplit> splits = hdfs.getSplits();
                            List<FileSplit> fileSplits = new ArrayList<FileSplit>();
                            for (int i : schedule) {
                                fileSplits.add((FileSplit) splits.get(i));
                            }
                            FileSplitsFactory splitsFactory = new FileSplitsFactory(fileSplits);
                            List<FileSplit> inputSplits = splitsFactory.getSplits();
                            ContextFactory ctxFactory = new ContextFactory();
                            int size = inputSplits.size();
                            InputStream stream;
                            String value;
                            RecordReader reader;
                            TaskAttemptContext context;
                            for (int i = 0; i < size; i++) {
                                //read split
                                context = ctxFactory.createContext(job.getConfiguration(), i);
                                try {
                                    reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                                    reader.initialize(inputSplits.get(i), context);
                                    while (reader.nextKeyValue()) {
                                        value = reader.getCurrentValue().toString();
                                        //Split value if it contains more than one item with the tag
                                        if (StringUtils.countMatches(value, tag) > 1) {
                                            String items[] = value.split(tag);
                                            for (String item : items) {
                                                if (item.length() > 0) {
                                                    item = START_TAG + tag + item;
                                                    stream = new ByteArrayInputStream(
                                                            item.getBytes(StandardCharsets.UTF_8));
                                                    parser.parseHDFSElements(stream, writer, fta, i);
                                                }
                                            }
                                        } else {
                                            value = START_TAG + value;
                                            //create an input stream to the file currently reading and send it to parser
                                            stream = new ByteArrayInputStream(
                                                    value.getBytes(StandardCharsets.UTF_8));
                                            parser.parseHDFSElements(stream, writer, fta, i);
                                        }
                                    }

                                } catch (InterruptedException e) {
                                    if (LOGGER.isLoggable(Level.SEVERE)) {
                                        LOGGER.severe(e.getMessage());
                                    }
                                }
                            }

                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (ParserConfigurationException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (SAXException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    } else {
                        try {
                            //check if the path exists and is a directory
                            if (fs.exists(directory) && fs.isDirectory(directory)) {
                                for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                                    //read every file in the directory
                                    RemoteIterator<LocatedFileStatus> it = fs.listFiles(directory, true);
                                    while (it.hasNext()) {
                                        xmlDocument = it.next().getPath();
                                        if (fs.isFile(xmlDocument)) {
                                            if (LOGGER.isLoggable(Level.FINE)) {
                                                LOGGER.fine("Starting to read XML document: "
                                                        + xmlDocument.getName());
                                            }
                                            //create an input stream to the file currently reading and send it to parser
                                            InputStream in = fs.open(xmlDocument).getWrappedStream();
                                            parser.parseHDFSElements(in, writer, fta, tupleIndex);
                                        }
                                    }
                                }
                            } else {
                                throw new HyracksDataException("Invalid HDFS directory parameter (" + nodeId
                                        + ":" + directory + ") passed to collection.");
                            }
                        } catch (FileNotFoundException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    }
                    try {
                        fs.close();
                    } catch (IOException e) {
                        if (LOGGER.isLoggable(Level.SEVERE)) {
                            LOGGER.severe(e.getMessage());
                        }
                    }
                }
            }
        }

        @Override
        public void fail() throws HyracksDataException {
            writer.fail();
        }

        @Override
        public void close() throws HyracksDataException {
            // Check if needed?
            fta.reset(frame);
            if (fta.getTupleCount() > 0) {
                FrameUtils.flushFrame(frame, writer);
            }
            writer.close();
        }
    };
}

From source file:org.bgi.flexlab.gaea.data.mapreduce.input.bam.GaeaBamInputFormat.java

License:Open Source License

public RecordReader<LongWritable, SamRecordWritable> createRecordReader(InputSplit split,
        TaskAttemptContext ctx) throws InterruptedException, IOException {
    RecordReader<LongWritable, SamRecordWritable> rr = new GaeaBamRecordReader();
    Configuration conf = ctx.getConfiguration();
    DEBUG_BAM_SPLITTER = conf.getBoolean("debug.bam.splitter", false);
    rr.initialize(split, ctx);
    return rr;
}

From source file:org.bgi.flexlab.gaea.data.mapreduce.input.sam.GaeaSamInputFormat.java

License:Open Source License

@Override
public RecordReader<LongWritable, SamRecordWritable> createRecordReader(InputSplit split,
        TaskAttemptContext ctx) throws InterruptedException, IOException {
    final RecordReader<LongWritable, SamRecordWritable> rr = new GaeaSamRecordReader();
    rr.initialize(split, ctx);
    return rr;
}

From source file:org.gridgain.grid.kernal.processors.hadoop.v2.GridHadoopV2MapTask.java

License:Open Source License

/** {@inheritDoc} */
@SuppressWarnings({ "ConstantConditions", "unchecked" })
@Override
public void run0(GridHadoopV2TaskContext taskCtx) throws GridException {
    GridHadoopInputSplit split = info().inputSplit();

    InputSplit nativeSplit;

    if (split instanceof GridHadoopFileBlock) {
        GridHadoopFileBlock block = (GridHadoopFileBlock) split;

        nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(), null);
    } else
        nativeSplit = (InputSplit) taskCtx.getNativeSplit(split);

    assert nativeSplit != null;

    OutputFormat outputFormat = null;
    Exception err = null;

    JobContextImpl jobCtx = taskCtx.jobContext();

    try {
        InputFormat inFormat = ReflectionUtils.newInstance(jobCtx.getInputFormatClass(),
                hadoopContext().getConfiguration());

        RecordReader reader = inFormat.createRecordReader(nativeSplit, hadoopContext());

        reader.initialize(nativeSplit, hadoopContext());

        hadoopContext().reader(reader);

        GridHadoopJobInfo jobInfo = taskCtx.job().info();

        outputFormat = jobInfo.hasCombiner() || jobInfo.hasReducer() ? null : prepareWriter(jobCtx);

        Mapper mapper = ReflectionUtils.newInstance(jobCtx.getMapperClass(),
                hadoopContext().getConfiguration());

        try {
            mapper.run(new WrappedMapper().getMapContext(hadoopContext()));
        } finally {
            closeWriter();
        }

        commit(outputFormat);
    } catch (InterruptedException e) {
        err = e;

        Thread.currentThread().interrupt();

        throw new GridInterruptedException(e);
    } catch (Exception e) {
        err = e;

        throw new GridException(e);
    } finally {
        if (err != null)
            abort(outputFormat);
    }
}

From source file:org.kududb.mapreduce.TestKuduTableInputFormat.java

License:Apache License

private RecordReader<NullWritable, RowResult> createRecordReader(String columnProjection,
        List<ColumnRangePredicate> predicates) throws IOException, InterruptedException {
    KuduTableInputFormat input = new KuduTableInputFormat();
    Configuration conf = new Configuration();
    conf.set(KuduTableInputFormat.MASTER_ADDRESSES_KEY, getMasterAddresses());
    conf.set(KuduTableInputFormat.INPUT_TABLE_KEY, TABLE_NAME);
    if (columnProjection != null) {
        conf.set(KuduTableInputFormat.COLUMN_PROJECTION_KEY, columnProjection);
    }
    if (predicates != null) {
        String encodedPredicates = KuduTableMapReduceUtil.base64EncodePredicates(predicates);
        conf.set(KuduTableInputFormat.ENCODED_COLUMN_RANGE_PREDICATES_KEY, encodedPredicates);
    }
    input.setConf(conf);
    List<InputSplit> splits = input.getSplits(null);

    // We need to re-create the input format to reconnect the client.
    input = new KuduTableInputFormat();
    input.setConf(conf);
    RecordReader<NullWritable, RowResult> reader = input.createRecordReader(null, null);
    reader.initialize(Iterables.getOnlyElement(splits), null);
    return reader;
}

From source file:org.mrgeo.data.MrsPyramidRecordReader.java

License:Apache License

private RecordReader<TileIdWritable, RasterWritable> createRecordReader(final MrsPyramidInputSplit split,
        final TaskAttemptContext context) throws IOException {
    InputSplit initializeWithSplit;
    // The record reader needs the native split returned from
    // the data plugin.
    RecordReader<TileIdWritable, RasterWritable> recordReader = getRecordReader(split.getName(),
            context.getConfiguration());
    initializeWithSplit = split.getWrappedSplit();

    try {
        recordReader.initialize(initializeWithSplit, context);
    } catch (Exception e) {
        throw new IOException(e);
    }
    return recordReader;
}

From source file:org.mrgeo.data.MrsPyramidSimpleRecordReader.java

License:Apache License

private RecordReader<TileIdWritable, TWritable> createRecordReader(final MrsPyramidInputSplit split,
        final TaskAttemptContext context) throws DataProviderNotFound, IOException {
    InputSplit initializeWithSplit = null;
    RecordReader<TileIdWritable, TWritable> recordReader = null;
    if (ifContext.getIncludeEmptyTiles()) {
        if (split.getWrappedSplit().getWrappedSplit() == null) {
            recordReader = new AllBlankTilesRecordReader();
        } else {
            recordReader = new AllTilesRecordReader();
        }
        // The all tiles record readers need the MrsPyramidInputSplit which
        // wraps the native split returned from the data plugin.
        initializeWithSplit = split;
    } else {
        // The standard record reader needs the native split returned from
        // the data plugin.
        recordReader = getRecordReader(split.getName(), context.getConfiguration());
        initializeWithSplit = split.getWrappedSplit();
    }
    try {
        recordReader.initialize(initializeWithSplit, context);
    } catch (Throwable t) {
        throw new IOException(t);
    }
    return recordReader;
}