Example usage for org.apache.hadoop.mapreduce RecordReader getCurrentValue

Introduction

This page collects example usages of org.apache.hadoop.mapreduce RecordReader.getCurrentValue from open-source projects.

Prototype

public abstract VALUEIN getCurrentValue() throws IOException, InterruptedException;

Document

Get the current value.
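
Example

A minimal sketch of the call pattern, for orientation before the project examples below: getCurrentValue() is only meaningful after nextKeyValue() has returned true, and the reader must be initialized before, and closed after, the read loop. The input path "input.txt" and the blank TaskAttemptID here are illustrative placeholders, not taken from any project listed under Usage.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class GetCurrentValueExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder input file; substitute a real path.
        FileSplit split = new FileSplit(new Path("input.txt"), 0, Long.MAX_VALUE, new String[0]);
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        InputFormat<LongWritable, Text> inputFormat = new TextInputFormat();
        RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
        reader.initialize(split, context);
        try {
            while (reader.nextKeyValue()) {
                // Valid only after nextKeyValue() has returned true.
                Text value = reader.getCurrentValue();
                System.out.println(value);
            }
        } finally {
            reader.close();
        }
    }
}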

Usage

From source file: org.apache.mnemonic.mapreduce.MneMapreduceBufferDataTest.java

License: Apache License

@Test(enabled = true, dependsOnMethods = { "testWriteBufferData" })
public void testReadBufferData() throws Exception {
    long reccnt = 0L;
    long tsize = 0L;
    byte[] buf;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            m_partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(m_partfns); // keep the order for checksum
    for (int idx = 0; idx < m_partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", m_partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, m_partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableBuffer<?>>> inputFormat = new MneInputFormat<MneDurableInputValue<DurableBuffer<?>>, DurableBuffer<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableBuffer<?>>> reader = inputFormat
                .createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableBuffer<?>> dbufval = null;
        while (reader.nextKeyValue()) {
            dbufval = reader.getCurrentValue();
            assert dbufval.getValue().getSize() == dbufval.getValue().get().capacity();
            dbufval.getValue().get().clear();
            buf = new byte[dbufval.getValue().get().capacity()];
            dbufval.getValue().get().get(buf);
            cs.update(buf, 0, buf.length);
            tsize += dbufval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of buffer is %d", m_checksum));
}

From source file: org.apache.mnemonic.mapreduce.MneMapreduceChunkDataTest.java

License: Apache License

@Test(enabled = true, dependsOnMethods = { "testWriteChunkData" })
public void testReadChunkData() throws Exception {
    List<String> partfns = new ArrayList<String>();
    long reccnt = 0L;
    long tsize = 0L;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(partfns); // keep the order for checksum
    for (int idx = 0; idx < partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat = new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader = inputFormat
                .createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableChunk<?>> dchkval = null;
        while (reader.nextKeyValue()) {
            dchkval = reader.getCurrentValue();
            byte b;
            for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
                b = unsafe.getByte(dchkval.getValue().get() + j);
                cs.update(b);
            }
            tsize += dchkval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}

From source file: org.apache.mnemonic.mapreduce.MneMapreduceLongDataTest.java

License: Apache License

@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
    long sum = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L,
                    new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat = new MneInputFormat<MneDurableInputValue<Long>, Long>();
            RecordReader<NullWritable, MneDurableInputValue<Long>> reader = inputFormat
                    .createRecordReader(split, m_tacontext);
            MneDurableInputValue<Long> mdval = null;
            while (reader.nextKeyValue()) {
                mdval = reader.getCurrentValue();
                sum += mdval.getValue();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_sum, sum);
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    System.out.println(String.format("The checksum of long data is %d", sum));
}

From source file: org.apache.mnemonic.mapreduce.MneMapreducePersonDataTest.java

License: Apache License

@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
    long sumage = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L,
                    new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat = new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
            RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader = inputFormat
                    .createRecordReader(split, m_tacontext);
            MneDurableInputValue<Person<Long>> personval = null;
            while (reader.nextKeyValue()) {
                personval = reader.getCurrentValue();
                AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
                sumage += personval.getValue().getAge();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_sumage, sumage);
    System.out.println(String.format("The checksum of ages is %d", sumage));
}

From source file: org.apache.orc.mapreduce.TestMapreduceOrcOutputFormat.java

License: Apache License

@Test
public void testPredicatePushdown() throws Exception {
    TaskAttemptID id = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, id);
    final String typeStr = "struct<i:int,s:string>";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    conf.set("mapreduce.output.fileoutputformat.outputdir", workDir.toString());
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setBoolean(OrcOutputFormat.SKIP_TEMP_DIRECTORY, true);
    OutputFormat<NullWritable, OrcStruct> outputFormat = new OrcOutputFormat<OrcStruct>();
    RecordWriter<NullWritable, OrcStruct> writer = outputFormat.getRecordWriter(attemptContext);

    // write 4000 rows with the integer and the binary string
    TypeDescription type = TypeDescription.fromString(typeStr);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();
    for (int r = 0; r < 4000; ++r) {
        row.setFieldValue(0, new IntWritable(r));
        row.setFieldValue(1, new Text(Integer.toBinaryString(r)));
        writer.write(nada, row);
    }
    writer.close(attemptContext);

    OrcInputFormat.setSearchArgument(conf,
            SearchArgumentFactory.newBuilder()
                    .between("i", PredicateLeaf.Type.LONG, new Long(1500), new Long(1999)).build(),
            new String[] { null, "i", "s" });
    FileSplit split = new FileSplit(new Path(workDir, "part-m-00000.orc"), 0, 1000000, new String[0]);
    RecordReader<NullWritable, OrcStruct> reader = new OrcInputFormat<OrcStruct>().createRecordReader(split,
            attemptContext);
    // the sarg should cause it to skip over the rows except 1000 to 2000
    for (int r = 1000; r < 2000; ++r) {
        assertEquals(true, reader.nextKeyValue());
        row = reader.getCurrentValue();
        assertEquals(r, ((IntWritable) row.getFieldValue(0)).get());
        assertEquals(Integer.toBinaryString(r), row.getFieldValue(1).toString());
    }
    assertEquals(false, reader.nextKeyValue());
}

From source file: org.apache.orc.mapreduce.TestMapreduceOrcOutputFormat.java

License: Apache License

@Test
public void testColumnSelection() throws Exception {
    String typeStr = "struct<i:int,j:int,k:int>";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    conf.set("mapreduce.output.fileoutputformat.outputdir", workDir.toString());
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setBoolean(OrcOutputFormat.SKIP_TEMP_DIRECTORY, true);
    TaskAttemptID id = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 1);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, id);
    OutputFormat<NullWritable, OrcStruct> outputFormat = new OrcOutputFormat<OrcStruct>();
    RecordWriter<NullWritable, OrcStruct> writer = outputFormat.getRecordWriter(attemptContext);

    // write 3000 rows with three integer columns
    TypeDescription type = TypeDescription.fromString(typeStr);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();
    for (int r = 0; r < 3000; ++r) {
        row.setFieldValue(0, new IntWritable(r));
        row.setFieldValue(1, new IntWritable(r * 2));
        row.setFieldValue(2, new IntWritable(r * 3));
        writer.write(nada, row);
    }
    writer.close(attemptContext);

    conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute(), "0,2");
    FileSplit split = new FileSplit(new Path(workDir, "part-m-00000.orc"), 0, 1000000, new String[0]);
    RecordReader<NullWritable, OrcStruct> reader = new OrcInputFormat<OrcStruct>().createRecordReader(split,
            attemptContext);
    // only columns 0 and 2 were included, so field 1 ("j") should read back as null
    for (int r = 0; r < 3000; ++r) {
        assertEquals(true, reader.nextKeyValue());
        row = reader.getCurrentValue();
        assertEquals(r, ((IntWritable) row.getFieldValue(0)).get());
        assertEquals(null, row.getFieldValue(1));
        assertEquals(r * 3, ((IntWritable) row.getFieldValue(2)).get());
    }
    assertEquals(false, reader.nextKeyValue());
}

From source file: org.apache.parquet.hadoop.thrift.TestParquetToThriftReadWriteAndProjection.java

License: Apache License

private <T extends TBase<?, ?>> void shouldDoProjection(Configuration conf, T recordToWrite,
        T expectedReadResult, Class<? extends TBase<?, ?>> thriftClass) throws Exception {
    final Path parquetFile = new Path("target/test/TestParquetToThriftReadWriteAndProjection/file.parquet");
    final FileSystem fs = parquetFile.getFileSystem(conf);
    if (fs.exists(parquetFile)) {
        fs.delete(parquetFile, true);
    }

    //create a test file
    final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
            ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));

    recordToWrite.write(protocol);
    w.write(new BytesWritable(baos.toByteArray()));
    w.close();

    final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
    final Job job = new Job(conf, "read");
    job.setInputFormatClass(ParquetThriftInputFormat.class);
    ParquetThriftInputFormat.setInputPaths(job, parquetFile);
    final JobID jobID = new JobID("local", 1);
    List<InputSplit> splits = parquetThriftInputFormat
            .getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
    T readValue = null;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
        final RecordReader<Void, T> reader = parquetThriftInputFormat.createRecordReader(split,
                taskAttemptContext);
        reader.initialize(split, taskAttemptContext);
        if (reader.nextKeyValue()) {
            readValue = reader.getCurrentValue();
            LOG.info(readValue);
        }
    }
    assertEquals(expectedReadResult, readValue);

}

From source file: org.apache.rya.accumulo.mr.GraphXEdgeInputFormatTest.java

License: Apache License

@SuppressWarnings("rawtypes")
@Test
public void testInputFormat() throws Exception {
    RyaStatement input = RyaStatement.builder().setSubject(new RyaURI("http://www.google.com"))
            .setPredicate(new RyaURI("http://some_other_uri")).setObject(new RyaURI("http://www.yahoo.com"))
            .setColumnVisibility(new byte[0]).setValue(new byte[0]).build();

    apiImpl.add(input);

    Job jobConf = Job.getInstance();

    GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName());
    GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password);
    GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO);
    GraphXEdgeInputFormat.setInputTableName(jobConf, table);

    GraphXEdgeInputFormat.setScanIsolation(jobConf, false);
    GraphXEdgeInputFormat.setLocalIterators(jobConf, false);
    GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false);

    GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat();

    JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID());

    List<InputSplit> splits = inputFormat.getSplits(context);

    Assert.assertEquals(1, splits.size());

    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(),
            new TaskAttemptID(new TaskID(), 1));

    RecordReader reader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext);
    reader.initialize(splits.get(0), taskAttemptContext);

    List<Edge> results = new ArrayList<Edge>();
    while (reader.nextKeyValue()) {
        Edge writable = (Edge) reader.getCurrentValue();
        long srcId = writable.srcId();
        long destId = writable.dstId();
        RyaTypeWritable rtw = null;
        Object text = reader.getCurrentKey();
        Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw);
        results.add(edge);

        System.out.println(text);
    }

    System.out.println(results.size());
    System.out.println(results);
    Assert.assertEquals(2, results.size());
}

From source file: org.apache.tinkerpop.gremlin.hadoop.structure.io.RecordReaderWriterTest.java

License: Apache License

private static void validateFileSplits(final List<FileSplit> fileSplits, final Configuration configuration,
        final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass,
        final Optional<Class<? extends OutputFormat<NullWritable, VertexWritable>>> outFormatClass)
        throws Exception {

    final InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, configuration);
    final TaskAttemptContext job = new TaskAttemptContextImpl(configuration,
            new TaskAttemptID(UUID.randomUUID().toString(), 0, TaskType.MAP, 0, 0));

    int vertexCount = 0;
    int outEdgeCount = 0;
    int inEdgeCount = 0;

    final OutputFormat<NullWritable, VertexWritable> outputFormat = outFormatClass.isPresent()
            ? ReflectionUtils.newInstance(outFormatClass.get(), configuration)
            : null;
    final RecordWriter<NullWritable, VertexWritable> writer = null == outputFormat ? null
            : outputFormat.getRecordWriter(job);

    boolean foundKeyValue = false;
    for (final FileSplit split : fileSplits) {
        logger.info("\treading file split {}", split.getPath().getName() + " ({}",
                split.getStart() + "..." + (split.getStart() + split.getLength()), "{} {} bytes)");
        final RecordReader reader = inputFormat.createRecordReader(split, job);

        float lastProgress = -1f;
        while (reader.nextKeyValue()) {
            //System.out.println("" + reader.getProgress() + "> " + reader.getCurrentKey() + ": " + reader.getCurrentValue());
            final float progress = reader.getProgress();
            assertTrue(progress >= lastProgress);
            assertEquals(NullWritable.class, reader.getCurrentKey().getClass());
            final VertexWritable vertexWritable = (VertexWritable) reader.getCurrentValue();
            if (null != writer)
                writer.write(NullWritable.get(), vertexWritable);
            vertexCount++;
            outEdgeCount = outEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.OUT));
            inEdgeCount = inEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.IN));
            //
            final Vertex vertex = vertexWritable.get();
            assertEquals(Integer.class, vertex.id().getClass());
            if (vertex.value("name").equals("SUGAR MAGNOLIA")) {
                foundKeyValue = true;
                assertEquals(92, IteratorUtils.count(vertex.edges(Direction.OUT)));
                assertEquals(77, IteratorUtils.count(vertex.edges(Direction.IN)));
            }
            lastProgress = progress;
        }
    }

    assertEquals(8049, outEdgeCount);
    assertEquals(8049, inEdgeCount);
    assertEquals(outEdgeCount, inEdgeCount);
    assertEquals(808, vertexCount);
    assertTrue(foundKeyValue);

    if (null != writer) {
        writer.close(new TaskAttemptContextImpl(configuration, job.getTaskAttemptID()));
        for (int i = 1; i < 10; i++) {
            final File outputDirectory = new File(
                    new URL(configuration.get("mapreduce.output.fileoutputformat.outputdir")).toURI());
            final List<FileSplit> splits = generateFileSplits(
                    new File(outputDirectory.getAbsoluteFile() + "/_temporary/0/_temporary/"
                            + job.getTaskAttemptID().getTaskID().toString().replace("task", "attempt") + "_0"
                            + "/part-m-00000"),
                    i);
            validateFileSplits(splits, configuration, inputFormatClass, Optional.empty());
        }
    }
}

From source file: org.apache.vxquery.metadata.VXQueryCollectionOperatorDescriptor.java

License: Apache License

@Override
public IOperatorNodePushable createPushRuntime(IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, int partition, int nPartitions)
        throws HyracksDataException {
    final FrameTupleAccessor fta = new FrameTupleAccessor(ctx.getFrameSize(),
            recordDescProvider.getInputRecordDescriptor(getActivityId(), 0));
    final int fieldOutputCount = recordDescProvider.getOutputRecordDescriptor(getActivityId(), 0)
            .getFieldCount();
    final ByteBuffer frame = ctx.allocateFrame();
    final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize(), fieldOutputCount);
    final short partitionId = (short) ctx.getTaskAttemptId().getTaskId().getPartition();
    final ITreeNodeIdProvider nodeIdProvider = new TreeNodeIdProvider(partitionId, dataSourceId,
            totalDataSources);
    final String nodeId = ctx.getJobletContext().getApplicationContext().getNodeId();
    final DynamicContext dCtx = (DynamicContext) ctx.getJobletContext().getGlobalJobData();

    final String collectionName = collectionPartitions[partition % collectionPartitions.length];
    final XMLParser parser = new XMLParser(false, nodeIdProvider, nodeId, frame, appender, childSeq,
            dCtx.getStaticContext());

    return new AbstractUnaryInputUnaryOutputOperatorNodePushable() {
        @Override
        public void open() throws HyracksDataException {
            appender.reset(frame, true);
            writer.open();
            hdfs = new HDFSFunctions();
        }

        @Override
        public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
            fta.reset(buffer);
            String collectionModifiedName = collectionName.replace("${nodeId}", nodeId);
            if (!collectionModifiedName.contains("hdfs:/")) {
                File collectionDirectory = new File(collectionModifiedName);
                //check if directory is in the local file system
                if (collectionDirectory.exists()) {
                    // Go through each tuple.
                    if (collectionDirectory.isDirectory()) {
                        for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                            Iterator<File> it = FileUtils.iterateFiles(collectionDirectory,
                                    new VXQueryIOFileFilter(), TrueFileFilter.INSTANCE);
                            while (it.hasNext()) {
                                File xmlDocument = it.next();
                                if (LOGGER.isLoggable(Level.FINE)) {
                                    LOGGER.fine(
                                            "Starting to read XML document: " + xmlDocument.getAbsolutePath());
                                }
                                parser.parseElements(xmlDocument, writer, fta, tupleIndex);
                            }
                        }
                    } else {
                        throw new HyracksDataException("Invalid directory parameter (" + nodeId + ":"
                                + collectionDirectory.getAbsolutePath() + ") passed to collection.");
                    }
                }
            } else {
                // Else check in HDFS file system
                // Get instance of the HDFS filesystem
                FileSystem fs = hdfs.getFileSystem();
                if (fs != null) {
                    collectionModifiedName = collectionModifiedName.replaceAll("hdfs:/", "");
                    Path directory = new Path(collectionModifiedName);
                    Path xmlDocument;
                    if (tag != null) {
                        hdfs.setJob(directory.getName(), tag);
                        tag = "<" + tag + ">";
                        Job job = hdfs.getJob();
                        InputFormat inputFormat = hdfs.getinputFormat();
                        try {
                            hdfs.scheduleSplits();
                            ArrayList<Integer> schedule = hdfs
                                    .getScheduleForNode(InetAddress.getLocalHost().getHostName());
                            List<InputSplit> splits = hdfs.getSplits();
                            List<FileSplit> fileSplits = new ArrayList<FileSplit>();
                            for (int i : schedule) {
                                fileSplits.add((FileSplit) splits.get(i));
                            }
                            FileSplitsFactory splitsFactory = new FileSplitsFactory(fileSplits);
                            List<FileSplit> inputSplits = splitsFactory.getSplits();
                            ContextFactory ctxFactory = new ContextFactory();
                            int size = inputSplits.size();
                            InputStream stream;
                            String value;
                            RecordReader reader;
                            TaskAttemptContext context;
                            for (int i = 0; i < size; i++) {
                                //read split
                                context = ctxFactory.createContext(job.getConfiguration(), i);
                                try {
                                    reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                                    reader.initialize(inputSplits.get(i), context);
                                    while (reader.nextKeyValue()) {
                                        value = reader.getCurrentValue().toString();
                                        //Split value if it contains more than one item with the tag
                                        if (StringUtils.countMatches(value, tag) > 1) {
                                            String items[] = value.split(tag);
                                            for (String item : items) {
                                                if (item.length() > 0) {
                                                    item = START_TAG + tag + item;
                                                    stream = new ByteArrayInputStream(
                                                            item.getBytes(StandardCharsets.UTF_8));
                                                    parser.parseHDFSElements(stream, writer, fta, i);
                                                }
                                            }
                                        } else {
                                            value = START_TAG + value;
                                            //create an input stream to the file currently reading and send it to parser
                                            stream = new ByteArrayInputStream(
                                                    value.getBytes(StandardCharsets.UTF_8));
                                            parser.parseHDFSElements(stream, writer, fta, i);
                                        }
                                    }

                                } catch (InterruptedException e) {
                                    if (LOGGER.isLoggable(Level.SEVERE)) {
                                        LOGGER.severe(e.getMessage());
                                    }
                                }
                            }

                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (ParserConfigurationException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (SAXException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    } else {
                        try {
                            //check if the path exists and is a directory
                            if (fs.exists(directory) && fs.isDirectory(directory)) {
                                for (int tupleIndex = 0; tupleIndex < fta.getTupleCount(); ++tupleIndex) {
                                    //read every file in the directory
                                    RemoteIterator<LocatedFileStatus> it = fs.listFiles(directory, true);
                                    while (it.hasNext()) {
                                        xmlDocument = it.next().getPath();
                                        if (fs.isFile(xmlDocument)) {
                                            if (LOGGER.isLoggable(Level.FINE)) {
                                                LOGGER.fine("Starting to read XML document: "
                                                        + xmlDocument.getName());
                                            }
                                            //create an input stream to the file currently reading and send it to parser
                                            InputStream in = fs.open(xmlDocument).getWrappedStream();
                                            parser.parseHDFSElements(in, writer, fta, tupleIndex);
                                        }
                                    }
                                }
                            } else {
                                throw new HyracksDataException("Invalid HDFS directory parameter (" + nodeId
                                        + ":" + directory + ") passed to collection.");
                            }
                        } catch (FileNotFoundException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        } catch (IOException e) {
                            if (LOGGER.isLoggable(Level.SEVERE)) {
                                LOGGER.severe(e.getMessage());
                            }
                        }
                    }
                    try {
                        fs.close();
                    } catch (IOException e) {
                        if (LOGGER.isLoggable(Level.SEVERE)) {
                            LOGGER.severe(e.getMessage());
                        }
                    }
                }
            }
        }

        @Override
        public void fail() throws HyracksDataException {
            writer.fail();
        }

        @Override
        public void close() throws HyracksDataException {
            // Check if needed?
            fta.reset(frame);
            if (fta.getTupleCount() > 0) {
                FrameUtils.flushFrame(frame, writer);
            }
            writer.close();
        }
    };
}