Example usage for org.apache.hadoop.mapreduce InputSplit getLength

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce InputSplit getLength.

Prototype

public abstract long getLength() throws IOException, InterruptedException;

Document

Get the size of the split, so that the input splits can be sorted by size.
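
For example, the splits returned by an InputFormat can be ordered by size before scheduling. A minimal sketch, not taken from the examples below; the inputFormat and jobContext variables are assumed to be set up elsewhere:

List<InputSplit> splits = inputFormat.getSplits(jobContext);
splits.sort((a, b) -> {
    try {
        // getLength() declares checked exceptions, so wrap them for the comparator
        return Long.compare(b.getLength(), a.getLength()); // largest split first
    } catch (IOException | InterruptedException e) {
        throw new RuntimeException(e);
    }
});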

Usage

From source file:com.marklogic.contentpump.CompressedRDFReader.java

License:Apache License
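In this reader, getLength() supplies the on-disk size of a gzip-compressed split. Since the stream expands on decompression, the size is multiplied by an estimated compression factor before the parser is initialized.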

@Override
protected void initStream(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((FileSplit) inSplit).getPath());
    FSDataInputStream fileIn = fs.open(file);
    URI zipURI = file.toUri();
    String codecString = conf.get(ConfigConstants.CONF_INPUT_COMPRESSION_CODEC,
            CompressionCodec.ZIP.toString());
    if (codecString.equalsIgnoreCase(CompressionCodec.ZIP.toString())) {
        zipIn = new ZipInputStream(fileIn);
        codec = CompressionCodec.ZIP;
        while (true) {
            try {
                currZipEntry = ((ZipInputStream) zipIn).getNextEntry();
                if (currZipEntry == null) {
                    break;
                }
                if (currZipEntry.getSize() != 0) {
                    subId = currZipEntry.getName();
                    break;
                }
            } catch (IllegalArgumentException e) {
                LOG.warn("Skipped a zip entry in : " + file.toUri() + ", reason: " + e.getMessage());
            }
        }
        if (currZipEntry == null) { // no entry in zip
            LOG.warn("No valid entry in zip:" + file.toUri());
            return;
        }
        ByteArrayOutputStream baos;
        long size = currZipEntry.getSize();
        if (size == -1) {
            baos = new ByteArrayOutputStream();
            // if we don't know the size, assume it's big!
            initParser(zipURI.toASCIIString() + "/" + subId, INMEMORYTHRESHOLD);
        } else {
            baos = new ByteArrayOutputStream((int) size);
            initParser(zipURI.toASCIIString() + "/" + subId, size);
        }
        int nb;
        while ((nb = zipIn.read(buf, 0, buf.length)) != -1) {
            baos.write(buf, 0, nb);
        }
        parse(subId, new ByteArrayInputStream(baos.toByteArray()));
    } else if (codecString.equalsIgnoreCase(CompressionCodec.GZIP.toString())) {
        long size = inSplit.getLength();
        zipIn = new GZIPInputStream(fileIn);
        codec = CompressionCodec.GZIP;
        initParser(zipURI.toASCIIString(), size * COMPRESSIONFACTOR);
        parse(file.getName(), zipIn);
    } else {
        // report the configured codec string; the codec field is not set on this path
        throw new UnsupportedOperationException("Unsupported codec: " + codecString);
    }
}

From source file:com.marklogic.contentpump.DelimitedJSONReader.java

License:Apache License
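Here getLength() records the total number of bytes in the split (totalBytes) so that progress can be computed against bytesRead.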

@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    /* Initialization in super class */
    initConfig(context);
    /*  Get file(s) in input split */
    setFile(((FileSplit) inSplit).getPath());
    // Initialize reader properties
    generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
    if (generateId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
    } else {
        uriName = conf.get(CONF_INPUT_URI_ID, null);
        mapper = new ObjectMapper();
    }
    bytesRead = 0;
    totalBytes = inSplit.getLength();
    /* Check file status */
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit) inSplit, context);
        inSplit = iterator.next();
    }
    /* Initialize buffered reader */
    initFileStream(inSplit);
}

From source file:com.marklogic.contentpump.DelimitedTextReader.java

License:Apache License
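As above, getLength() captures the split size (fileLen) for progress tracking before the CSV parser is constructed.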

protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((FileSplit) inSplit).getPath());
    configFileNameAsCollection(conf, file);

    fileIn = fs.open(file);
    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }
    parser = new CSVParser(instream, CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}

From source file:com.marklogic.contentpump.RDFReader.java

License:Apache License
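The simplest pattern: the split length is passed directly to the parser as the expected input size.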

protected void initStream(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((FileSplit) inSplit).getPath());
    long size = inSplit.getLength();
    initParser(file.toUri().toASCIIString(), size);
    parse(file.getName());
}

From source file:com.marklogic.contentpump.SplitDelimitedTextReader.java

License:Apache License
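This reader handles splits that may start mid-line. getLength() again provides fileLen for progress tracking, while the header carried by the DelimitedSplit supplies the field names.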

@Override
protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((DelimitedSplit) inSplit).getPath());
    configFileNameAsCollection(conf, file);

    // get header from the DelimitedSplit
    TextArrayWritable taw = ((DelimitedSplit) inSplit).getHeader();
    fields = taw.toStrings();
    try {
        docBuilder.configFields(conf, fields);
    } catch (IllegalArgumentException e) {
        LOG.error("Skipped file: " + file.toUri() + ", reason: " + e.getMessage());
        return;
    }

    fileIn = fs.open(file);
    lineSeparator = retrieveLineSeparator(fileIn);
    if (start != 0) {
        // in case the cut point is \n, back off 1 char to create a partial
        // line so that 1st line can be skipped
        start--;
    }

    fileIn.seek(start);

    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }

    boolean found = generateId || uriId == 0;

    for (int i = 0; i < fields.length && !found; i++) {
        if (fields[i].equals(uriName)) {
            uriId = i;
            found = true;
            break;
        }
    }
    if (!found) {
        // uriName doesn't match any column
        LOG.error("Skipped file: " + file.toUri() + ", reason: " + URI_ID + " " + uriName + " is not found");
        return;
    }

    // keep leading and trailing whitespaces to ensure accuracy of pos
    // do not skip empty line just in case the split boundary is \n
    parser = new CSVParser(instream, CSVParserFormatter.getFormat(delimiter, encapsulator, false, false));
    parserIterator = parser.iterator();

    // skip first line:
    // 1st split, skip header; other splits, skip partial line
    if (parserIterator.hasNext()) {
        String[] values = getLine();
        start += getBytesCountFromLine(values);
        pos = start;
    }
}

From source file:com.marklogic.mapreduce.examples.ContentLoader.java

License:Apache License
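getLength() is called twice here: once to record the total byte count, and once to size a buffer large enough to hold the whole split, which is then read fully into memory.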

@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    bytesTotal = inSplit.getLength();
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    key.set(file.toString());
    byte[] buf = new byte[(int) inSplit.getLength()];
    try {
        fileIn.readFully(buf);
        value.set(buf);
        hasNext = true;
    } catch (Exception e) {
        hasNext = false;
    } finally {
        fileIn.close();
    }
}

From source file:com.marklogic.mapreduce.examples.WikiLoader.java

License:Apache License
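getLength() bounds how many bytes this reader consumes from its split. The reader scans for page tags and reads past the split boundary only to complete the final page.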

@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit) inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int) Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;
        String temp = new String(buf, 0, read);
        if (sindex == -1) { // haven't found the start yet    
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else { // reached the end of this split
            // look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                    temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + " bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}

From source file:com.tomslabs.grid.avro.AvroRecordReader.java

License:Apache License
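For Avro container files, the end offset is the split's start plus getLength(); the reader syncs to the start and stops once it passes this position.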

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration config = context.getConfiguration();
    Path path = fileSplit.getPath();

    this.in = new FsInput(path, config);

    DatumReader<T> datumReader = getDatumReader(config);

    this.reader = new DataFileReader<T>(in, datumReader);
    reader.sync(fileSplit.getStart()); // sync to start

    this.start = in.tell();
    this.end = fileSplit.getStart() + split.getLength();
}

From source file:com.twitter.elephanttwin.retrieval.IndexedFilterRecordReader.java

License:Apache License
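Here getLength() acts as a guard: a split with no bytes has nothing to read, so the filter reader is left unset.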

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {

    this.context = context;
    if (split.getLength() <= 0) {
        filterReader = null;
        return;
    }

    fileSplits = computeFileSplits(split, context);
    subBlockCnt = fileSplits.size();

    if (subBlockCnt == 0) {
        filterReader = null;
        return;
    }
    currentSubBlock = 1;
    filterReader.initialize(fileSplits.remove(0), context);
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3.java

License:Open Source License
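The split length determines the initial end offset, which is later adjusted when the input turns out to be wrapped in a splittable compression stream.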

public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    LOG.info("Open a SpatialRecordReader to split: " + split);
    FileSplit fsplit = (FileSplit) split;
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = new CompressionCodecFactory(conf).getCodec(this.path);

    if (codec != null) {
        // Input is compressed, create a decompressor to decompress it
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // A splittable compression codec, can seek to the desired input pos
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new DataInputStream(cIn);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            // take pos from compressed stream as we adjusted both start and end
            // to match with the compressed file
            filePosition = cIn;
        } else {
            // Non-splittable input, need to start from the beginning
            CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
            in = new DataInputStream(cIn);
            filePosition = cIn;
        }
    } else {
        // Non-compressed file, seek to the desired position and use this stream
        // to get the progress and position
        directIn.seek(start);
        in = directIn;
        filePosition = directIn;
    }
    byte[] signature = new byte[8];
    in.readFully(signature);
    if (!Arrays.equals(signature, SpatialSite.RTreeFileMarkerB)) {
        throw new RuntimeException("Incorrect signature for RTree");
    }
    this.stockShape = (V) OperationsParams.getShape(conf, "shape");

    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
        // Retrieve the input query range to apply on all records
        this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange);
        this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
        cellMBR = new Partition();
        cellMBR.invalidate();
    } else {
        // Set from the associated partition in the global index
        for (Partition p : gindex) {
            if (p.filename.equals(this.path.getName()))
                cellMBR = p;
        }
    }
}