Example usage for org.apache.hadoop.mapreduce InputSplit getLength

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce InputSplit getLength.

Prototype

public abstract long getLength() throws IOException, InterruptedException;

Document

Get the size of the split, so that the input splits can be sorted by size.
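
For example, the splits returned by an InputFormat can be ordered by size before scheduling. A minimal sketch, not taken from the examples below; the inputFormat and jobContext variables are assumed to be set up elsewhere:

List<InputSplit> splits = inputFormat.getSplits(jobContext);
splits.sort((a, b) -> {
    try {
        // getLength() declares checked exceptions, so wrap them for the comparator
        return Long.compare(b.getLength(), a.getLength()); // largest split first
    } catch (IOException | InterruptedException e) {
        throw new RuntimeException(e);
    }
});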

Usage

From source file:com.marklogic.contentpump.CompressedRDFReader.java

License:Apache License
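In this reader, getLength() supplies the on-disk size of a gzip-compressed split. Since the stream expands on decompression, the size is multiplied by an estimated compression factor before the parser is initialized.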

@Override
protected void initStream(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((FileSplit) inSplit).getPath());
    FSDataInputStream fileIn = fs.open(file);
    URI zipURI = file.toUri();
    String codecString = conf.get(ConfigConstants.CONF_INPUT_COMPRESSION_CODEC,
            CompressionCodec.ZIP.toString());
    if (codecString.equalsIgnoreCase(CompressionCodec.ZIP.toString())) {
        zipIn = new ZipInputStream(fileIn);
        codec = CompressionCodec.ZIP;
        while (true) {
            try {
                currZipEntry = ((ZipInputStream) zipIn).getNextEntry();
                if (currZipEntry == null) {
                    break;
                }
                if (currZipEntry.getSize() != 0) {
                    subId = currZipEntry.getName();
                    break;
                }
            } catch (IllegalArgumentException e) {
                LOG.warn("Skipped a zip entry in : " + file.toUri() + ", reason: " + e.getMessage());
            }
        }
        if (currZipEntry == null) { // no entry in zip
            LOG.warn("No valid entry in zip:" + file.toUri());
            return;
        }
        ByteArrayOutputStream baos;
        long size = currZipEntry.getSize();
        if (size == -1) {
            baos = new ByteArrayOutputStream();
            // if we don't know the size, assume it's big!
            initParser(zipURI.toASCIIString() + "/" + subId, INMEMORYTHRESHOLD);
        } else {
            baos = new ByteArrayOutputStream((int) size);
            initParser(zipURI.toASCIIString() + "/" + subId, size);
        }
        int nb;
        while ((nb = zipIn.read(buf, 0, buf.length)) != -1) {
            baos.write(buf, 0, nb);
        }
        parse(subId, new ByteArrayInputStream(baos.toByteArray()));
    } else if (codecString.equalsIgnoreCase(CompressionCodec.GZIP.toString())) {
        long size = inSplit.getLength();
        zipIn = new GZIPInputStream(fileIn);
        codec = CompressionCodec.GZIP;
        initParser(zipURI.toASCIIString(), size * COMPRESSIONFACTOR);
        parse(file.getName(), zipIn);
    } else {
        // report the configured codec string; the codec field is not set on this path
        throw new UnsupportedOperationException("Unsupported codec: " + codecString);
    }
}

From source file:com.marklogic.contentpump.DelimitedJSONReader.java

License:Apache License
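Here getLength() records the total number of bytes in the split (totalBytes) so that progress can be computed against bytesRead.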

@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    /* Initialization in super class */
    initConfig(context);
    /*  Get file(s) in input split */
    setFile(((FileSplit) inSplit).getPath());
    // Initialize reader properties
    generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
    if (generateId) {
        idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
    } else {
        uriName = conf.get(CONF_INPUT_URI_ID, null);
        mapper = new ObjectMapper();
    }
    bytesRead = 0;
    totalBytes = inSplit.getLength();
    /* Check file status */
    fs = file.getFileSystem(context.getConfiguration());
    FileStatus status = fs.getFileStatus(file);
    if (status.isDirectory()) {
        iterator = new FileIterator((FileSplit) inSplit, context);
        inSplit = iterator.next();
    }
    /* Initialize buffered reader */
    initFileStream(inSplit);
}

From source file:com.marklogic.contentpump.DelimitedTextReader.java

License:Apache License
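As above, getLength() captures the split size (fileLen) for progress tracking before the CSV parser is constructed.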

protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((FileSplit) inSplit).getPath());
    configFileNameAsCollection(conf, file);

    fileIn = fs.open(file);
    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }
    parser = new CSVParser(instream, CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}

From source file:com.marklogic.contentpump.RDFReader.java

License:Apache License
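The simplest pattern: the split length is passed directly to the parser as the expected input size.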

protected void initStream(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((FileSplit) inSplit).getPath());
    long size = inSplit.getLength();
    initParser(file.toUri().toASCIIString(), size);
    parse(file.getName());
}

From source file:com.marklogic.contentpump.SplitDelimitedTextReader.java

License:Apache License
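This reader handles splits that may start mid-line. getLength() again provides fileLen for progress tracking, while the header carried by the DelimitedSplit supplies the field names.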

@Override
protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    setFile(((DelimitedSplit) inSplit).getPath());
    configFileNameAsCollection(conf, file);

    // get header from the DelimitedSplit
    TextArrayWritable taw = ((DelimitedSplit) inSplit).getHeader();
    fields = taw.toStrings();
    try {
        docBuilder.configFields(conf, fields);
    } catch (IllegalArgumentException e) {
        LOG.error("Skipped file: " + file.toUri() + ", reason: " + e.getMessage());
        return;
    }

    fileIn = fs.open(file);
    lineSeparator = retrieveLineSeparator(fileIn);
    if (start != 0) {
        // in case the cut point is \n, back off 1 char to create a partial
        // line so that 1st line can be skipped
        start--;
    }

    fileIn.seek(start);

    instream = new InputStreamReader(fileIn, encoding);

    bytesRead = 0;
    fileLen = inSplit.getLength();
    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (generateId) {
            idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart());
        } else {
            uriId = 0;
        }
    }

    boolean found = generateId || uriId == 0;

    for (int i = 0; i < fields.length && !found; i++) {
        if (fields[i].equals(uriName)) {
            uriId = i;
            found = true;
            break;
        }
    }
    if (!found) {
        // uriName doesn't match any column
        LOG.error("Skipped file: " + file.toUri() + ", reason: " + URI_ID + " " + uriName + " is not found");
        return;
    }

    // keep leading and trailing whitespaces to ensure accuracy of pos
    // do not skip empty line just in case the split boundary is \n
    parser = new CSVParser(instream, CSVParserFormatter.getFormat(delimiter, encapsulator, false, false));
    parserIterator = parser.iterator();

    // skip first line:
    // 1st split, skip header; other splits, skip partial line
    if (parserIterator.hasNext()) {
        String[] values = getLine();
        start += getBytesCountFromLine(values);
        pos = start;
    }
}

From source file:com.marklogic.mapreduce.examples.ContentLoader.java

License:Apache License
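getLength() is called twice here: once to record the total byte count, and once to size a buffer large enough to hold the whole split, which is then read fully into memory.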

@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    bytesTotal = inSplit.getLength();
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    key.set(file.toString());
    byte[] buf = new byte[(int) inSplit.getLength()];
    try {
        fileIn.readFully(buf);
        value.set(buf);
        hasNext = true;
    } catch (Exception e) {
        hasNext = false;
    } finally {
        fileIn.close();
    }
}

From source file:com.marklogic.mapreduce.examples.WikiLoader.java

License:Apache License
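getLength() bounds how many bytes this reader consumes from its split. The reader scans for page tags and reads past the split boundary only to complete the final page.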

@Override
public void initialize(InputSplit inSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    Path file = ((FileSplit) inSplit).getPath();
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FSDataInputStream fileIn = fs.open(file);
    byte[] buf = new byte[BUFFER_SIZE];
    long bytesTotal = inSplit.getLength();
    long start = ((FileSplit) inSplit).getStart();
    fileIn.seek(start);
    long bytesRead = 0;
    StringBuilder pages = new StringBuilder();
    int sindex = -1;
    while (true) {
        int length = (int) Math.min(bytesTotal - bytesRead, buf.length);
        int read = fileIn.read(buf, 0, length);
        if (read == -1) {
            System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead);
            break;
        }
        bytesRead += read;
        String temp = new String(buf, 0, read);
        if (sindex == -1) { // haven't found the start yet    
            sindex = temp.indexOf(BEGIN_PAGE_TAG);
            if (sindex > -1) {
                pages.append(temp.substring(sindex));
            }
        } else if (bytesRead < bytesTotal) { // haven't completed the split
            pages.append(temp);
        } else { // reached the end of this split
            // look for end
            int eindex = 0;
            if (temp.contains(END_DOC_TAG) || // reached the end of doc
                    temp.endsWith(END_PAGE_TAG)) {
                eindex = temp.lastIndexOf(END_PAGE_TAG);
                pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                System.out.println("Found end of doc.");
            } else { // need to read ahead to look for end of page
                while (true) {
                    read = fileIn.read(buf, 0, READ_AHEAD_SIZE);
                    if (read == -1) { // no more to read
                        System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + " bytesRead=" + bytesRead);
                        System.out.println(temp);
                        break;
                    }
                    bytesRead += read;
                    // look for end
                    temp = new String(buf, 0, read);
                    eindex = temp.indexOf(END_PAGE_TAG);
                    if (eindex > -1) {
                        pages.append(temp.substring(0, eindex + END_PAGE_TAG.length()));
                        break;
                    } else {
                        pages.append(temp);
                    }
                }
            }
            break;
        }
    }
    fileIn.close();
    articles = WikiModelProcessor.process(pages);
}

From source file:com.tomslabs.grid.avro.AvroRecordReader.java

License:Apache License
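For Avro container files, the end offset is the split's start plus getLength(); the reader syncs to the start and stops once it passes this position.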

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fileSplit = (FileSplit) split;
    Configuration config = context.getConfiguration();
    Path path = fileSplit.getPath();

    this.in = new FsInput(path, config);

    DatumReader<T> datumReader = getDatumReader(config);

    this.reader = new DataFileReader<T>(in, datumReader);
    reader.sync(fileSplit.getStart()); // sync to start

    this.start = in.tell();
    this.end = fileSplit.getStart() + split.getLength();
}

From source file:com.twitter.elephanttwin.retrieval.IndexedFilterRecordReader.java

License:Apache License
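Here getLength() acts as a guard: a split with no bytes has nothing to read, so the filter reader is left unset.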

@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {

    this.context = context;
    if (split.getLength() <= 0) {
        filterReader = null;
        return;
    }

    fileSplits = computeFileSplits(split, context);
    subBlockCnt = fileSplits.size();

    if (subBlockCnt == 0) {
        filterReader = null;
        return;
    }
    currentSubBlock = 1;
    filterReader.initialize(fileSplits.remove(0), context);
}

From source file:edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3.java

License:Open Source License
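The split length determines the initial end offset, which is later adjusted when the input turns out to be wrapped in a splittable compression stream.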

public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException {
    LOG.info("Open a SpatialRecordReader to split: " + split);
    FileSplit fsplit = (FileSplit) split;
    this.path = fsplit.getPath();
    this.start = fsplit.getStart();
    this.end = this.start + split.getLength();
    this.fs = this.path.getFileSystem(conf);
    this.directIn = fs.open(this.path);
    codec = new CompressionCodecFactory(conf).getCodec(this.path);

    if (codec != null) {
        // Input is compressed, create a decompressor to decompress it
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            // A splittable compression codec, can seek to the desired input pos
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new DataInputStream(cIn);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            // take pos from compressed stream as we adjusted both start and end
            // to match with the compressed file
            filePosition = cIn;
        } else {
            // Non-splittable input, need to start from the beginning
            CompressionInputStream cIn = codec.createInputStream(directIn, decompressor);
            in = new DataInputStream(cIn);
            filePosition = cIn;
        }
    } else {
        // Non-compressed file, seek to the desired position and use this stream
        // to get the progress and position
        directIn.seek(start);
        in = directIn;
        filePosition = directIn;
    }
    byte[] signature = new byte[8];
    in.readFully(signature);
    if (!Arrays.equals(signature, SpatialSite.RTreeFileMarkerB)) {
        throw new RuntimeException("Incorrect signature for RTree");
    }
    this.stockShape = (V) OperationsParams.getShape(conf, "shape");

    if (conf.get(SpatialInputFormat3.InputQueryRange) != null) {
        // Retrieve the input query range to apply on all records
        this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange);
        this.inputQueryMBR = this.inputQueryRange.getMBR();
    }

    // Check if there is an associated global index to read cell boundaries
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent());
    if (gindex == null) {
        cellMBR = new Partition();
        cellMBR.invalidate();
    } else {
        // Set from the associated partition in the global index
        for (Partition p : gindex) {
            if (p.filename.equals(this.path.getName()))
                cellMBR = p;
        }
    }
}