Example usage for org.apache.hadoop.fs FSDataInputStream seek

List of usage examples for org.apache.hadoop.fs FSDataInputStream seek

Introduction

On this page you can find example usage for org.apache.hadoop.fs FSDataInputStream seek.

Prototype

@Override
public void seek(long desired) throws IOException 

Document

Seek to the given offset.
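
For orientation, a minimal sketch of seeking before a read (the helper name, path, and parameters are hypothetical; the FileSystem is assumed to be already configured):

public static byte[] readAt(FileSystem fs, Path file, long offset, int len) throws IOException {
    byte[] buf = new byte[len];
    try (FSDataInputStream in = fs.open(file)) {
        in.seek(offset);           // position the stream at the requested byte offset
        in.readFully(buf, 0, len); // read exactly len bytes starting at that offset
    }
    return buf;
}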

Usage

From source file:com.uber.hoodie.common.table.log.block.HoodieLogBlock.java

License:Apache License

/**
 * Read or skip the content of a log block in the log file, depending on whether lazy
 * reading is enabled in {@link HoodieMergedLogRecordScanner}.
 */
public static byte[] readOrSkipContent(FSDataInputStream inputStream, Integer contentLength,
        boolean readBlockLazily) throws IOException {
    byte[] content = null;
    if (!readBlockLazily) {
        // Read the contents in memory
        content = new byte[contentLength];
        inputStream.readFully(content, 0, contentLength);
    } else {
        // Seek to the end of the content block
        inputStream.seek(inputStream.getPos() + contentLength);
    }
    return content;
}
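
A hypothetical caller sketch (not Hudi's actual scanner code) showing how the lazy path pairs with seek: the position of the skipped content is remembered so the block can be read later only if it is needed.

// Remember where the block content begins, skip past it for now, and only
// seek back and read the bytes if the block turns out to be needed.
long contentStart = inputStream.getPos();
byte[] content = readOrSkipContent(inputStream, contentLength, true);  // returns null; stream is now past the block
// ... later, if the block is actually required:
inputStream.seek(contentStart);
content = readOrSkipContent(inputStream, contentLength, false);        // reads contentLength bytes this time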

From source file:com.yolodata.tbana.hadoop.mapred.csv.CSVLineRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, JobConf conf) throws IOException {
    FileSplit split = (FileSplit) genericSplit;

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (codec != null) {
        is = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }

    this.pos = start;
    init(is, conf);
}

From source file:com.yolodata.tbana.hadoop.mapred.shuttl.ShuttlCSVInputFormat.java

License:Open Source License

private static long findEndOfLinePosition(FSDataInputStream in, long end) throws IOException {
    in.seek(end);
    int c;
    String findNewLineBuffer = "";
    while ((c = in.read()) != -1) {
        char ch = (char) c;

        // a real new line is found!
        if (findNewLineBuffer.equals("\"\n") && ch != ',')
            return in.getPos();

        if (findNewLineBuffer.equals("\"") && ch == '\n') {
            findNewLineBuffer = findNewLineBuffer.concat("\n");
            continue;
        }

        if (ch == '\"' && findNewLineBuffer.length() == 0) {
            findNewLineBuffer = "\"";
            continue;
        }
        findNewLineBuffer = "";

    }
    return in.getPos();
}
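
A hypothetical caller sketch (variable names are illustrative and assumed to be in scope) that uses this helper to extend a raw byte-offset split so it ends on a real CSV record boundary rather than inside a quoted field:

long rawEnd = start + splitSize;                          // naive split end; may fall inside a quoted field
try (FSDataInputStream in = path.getFileSystem(conf).open(path)) {
    long adjustedEnd = findEndOfLinePosition(in, rawEnd); // scan forward to the next real record boundary
    // build the split with [start, adjustedEnd) instead of [start, rawEnd)
}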

From source file:com.yolodata.tbana.hadoop.mapred.shuttl.ShuttlCSVRecordReader.java

License:Open Source License

public void initialize(InputSplit genericSplit, JobConf conf) throws IOException {
    CsvSplit split = (CsvSplit) genericSplit;

    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getFilepath();

    startKey = split.getKeyStart();
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getFilepath());

    if (start != 0) {
        fileIn.seek(start);
    }
    this.is = fileIn;
    this.pos = start;

    createReader(is);

    if (split.isSkipHeader())
        next(null, null);
}

From source file:com.zjy.mongo.splitter.BSONSplitter.java

License:Apache License

/**
 * Get the position at which the BSONFileRecordReader should begin
 * iterating the given split. This may not be at the beginning of the split
 * if the splits were not calculated by BSONSplitter.
 *
 * @param split the FileSplit for which to find the starting position.
 * @return the position of the first complete document within the split.
 * @throws IOException when an error occurs while reading a file
 */
public synchronized long getStartingPositionForSplit(final FileSplit split) throws IOException {

    FileSystem fs = split.getPath().getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(split.getPath());
    ArrayList<BSONFileSplit> splits;
    BSONFileSplit[] splitsArr;

    // Get splits calculated on document boundaries.
    if (MongoConfigUtil.getBSONReadSplits(getConf())) {
        // Use the splits file to load splits on document boundaries.
        try {
            // Try to use the existing splits file.
            loadSplitsFromSplitFile(file, getSplitsFilePath(file.getPath(), getConf()));
        } catch (NoSplitFileException e) {
            // Create a splits file from scratch.
            readSplitsForFile(file);
        }
        splits = getAllSplits();
    } else {
        // Can't use a splits file, so create splits from scratch.
        splits = (ArrayList<BSONFileSplit>) splitFile(file);
    }
    splitsArr = new BSONFileSplit[splits.size()];
    splits.toArray(splitsArr);

    // Get the first pre-calculated split occurring before the start of
    // the given split.
    long previousStart = split.getStart();
    long startIterating = 0;
    for (BSONFileSplit bfs : splitsArr) {
        if (bfs.getStart() >= split.getStart()) {
            startIterating = previousStart;
            break;
        }
        previousStart = bfs.getStart();
    }

    // Beginning at 'startIterating', jump to the first document that begins
    // at or beyond the given split.
    FSDataInputStream fsDataStream = null;
    long pos = startIterating;
    try {
        fsDataStream = fs.open(split.getPath());
        fsDataStream.seek(pos);
        while (pos < split.getStart()) {
            callback.reset();
            bsonDec.decode(fsDataStream, callback);
            pos = fsDataStream.getPos();
        }
    } finally {
        if (null != fsDataStream) {
            fsDataStream.close();
        }
    }

    return pos;
}
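
A hypothetical sketch (not taken from the project) of how a record reader might combine this with seek; fileSplit and conf are assumed to be in scope, and the splitter is Configurable, as the use of getConf() above implies:

BSONSplitter splitter = new BSONSplitter();
splitter.setConf(conf);                                            // splitter needs a Configuration
long startPos = splitter.getStartingPositionForSplit(fileSplit);   // first complete document in the split

FileSystem fs = fileSplit.getPath().getFileSystem(conf);
try (FSDataInputStream in = fs.open(fileSplit.getPath())) {
    in.seek(startPos);   // documents from here up to the split end belong to this split
    // ... decode BSON documents until getPos() passes fileSplit.getStart() + fileSplit.getLength()
}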

From source file:cosmos.mapred.LongLineRecordReader.java

License:Apache License

@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LfLineReader(fileIn, job);
    }
    if (skipFirstLine) { // skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}

From source file:crunch.MaxTemperature.java

License:Apache License

public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    FSDataInputStream in = null; // FSDataInputStream is Seekable; previous examples used a plain InputStream (no seeking was done)
    try {
        in = fs.open(new Path(uri));
        IOUtils.copyBytes(in, System.out, 4096, false);
        in.seek(0); // go back to the start of the file and copy it again
        IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
        IOUtils.closeStream(in);
    }
}
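
As a related sketch (not part of the example above), FSDataInputStream also implements PositionedReadable, so bytes at an arbitrary offset can be read without disturbing the current stream position; here `in` is assumed to be an open FSDataInputStream:

byte[] header = new byte[16];
in.readFully(0, header);   // read 16 bytes from offset 0; getPos() is unchanged
// the next sequential read (or copyBytes) continues from wherever the stream already was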

From source file:cs480a2.yqiu.recSystem.mapreduce.input.SingleBookReader.java

/**
 * @param inputSplit
 * @param context    the information about the task
 * @throws IOException
 * @throws InterruptedException
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    FileSplit split = (FileSplit) inputSplit;
    Configuration configuration = context.getConfiguration();
    Path path = split.getPath();
    filename = path.getName();
    FileSystem fileSystem = path.getFileSystem(configuration);
    FSDataInputStream inputStream = fileSystem.open(path);
    lineReader = new LineReader(inputStream, configuration);

    //initial start point and end point
    start = split.getStart();
    end = start + split.getLength();

    inputStream.seek(start);
    if (start != 0) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }

    start += lineReader.readLine(currentLine);

    prepareToScanBook();
}

From source file:de.l3s.archivepig.enrich.Response.java

License:Open Source License

@Override
public void enrich(Tuple data, Tuple enrichment, Object... params) throws Exception {
    long size = get(data, "_record.size");
    long offset = get(data, "_record.offset");
    String filename = get(data, "_record.filename");
    String cdxFile = get(data, "_record.cdxFile");

    if (size < 0 || offset < 0)
        return;

    FileSystem fs = FileSystem.get(UDFContext.getUDFContext().getJobConf());

    Deque<String> cdxSegments = new ArrayDeque<String>(Lists.reverse(list(cdxFile.split("\\/"))));
    cdxSegments.pop(); // remove filename
    String pathExtension = "";
    Path path = new Path(ArchiveLoader.dataPath(), pathExtension + filename);
    while (!fs.exists(path)) {
        if (cdxSegments.isEmpty()) {
            enrichment.append(new HashMap<String, String>());
            enrichment.append(new HashMap<String, String>());
            enrichment.append(null);
            return;
        }
        String cdxSegment = cdxSegments.pop();
        if (cdxSegment.endsWith(".har"))
            cdxSegment = cdxSegment.substring(0, cdxSegment.length() - 4);
        pathExtension = cdxSegment + "/" + pathExtension;
        path = new Path(ArchiveLoader.dataPath(), pathExtension + filename);
    }
    FSDataInputStream fsin = fs.open(path);
    fsin.seek(offset);
    InputStream in = fsin;

    ByteArrayOutputStream recordOutput = new ByteArrayOutputStream();
    try {
        try (BoundedInputStream boundedIn = new BoundedInputStream(in, size);
                ArchiveReader reader = ArchiveReaderFactory.get(filename, boundedIn, false);) {
            ArchiveRecord record;
            record = reader.get();

            ArchiveRecordHeader header = record.getHeader();
            enrichment.append(header.getHeaderFields());

            record.dump(recordOutput);
        } catch (Exception e) {
            return;
        } finally {
            in.close();
            recordOutput.close();
        }
    } catch (Exception e) {
        return;
    }

    try (InputStream httpResponse = new ByteArrayInputStream(recordOutput.toByteArray())) {
        // ALL COMMENTS ARE NEW VERSION VARIANTS FOR HTTP-CORE 4.3, currently in use 4.2.5
        //        SessionInputBufferImpl sessionInputBuffer = new SessionInputBufferImpl(new HttpTransportMetricsImpl(), 2048);
        //        sessionInputBuffer.bind(httpResponse);
        //        DefaultHttpResponseParserFactory responseParserFactory = new DefaultHttpResponseParserFactory();
        //        HttpMessageParser<HttpResponse> responseParser = responseParserFactory.create(sessionInputBuffer, MessageConstraints.DEFAULT);
        //        HttpResponse response = responseParser.parse();
        //        Header[] httpHeaders = response.getAllHeaders();

        HttpResponseParser parser = new HttpResponseParser();
        HttpResponse response = parser.parse(httpResponse);
        HttpHeaders httpHeaders = response.getHeaders();

        Map<String, String> httpHeadersMap = new HashMap<String, String>();
        for (HttpHeader httpHeader : httpHeaders) {
            httpHeadersMap.put(httpHeader.getName(), httpHeader.getValue());
        }
        enrichment.append(httpHeadersMap);

        //        byte[] payload = new byte[sessionInputBuffer.length()];
        //        sessionInputBuffer.read(payload);

        byte[] payload = IOUtils.toByteArray(response);

        enrichment.append(payload);

        //        HttpEntity entity = new ByteArrayEntity(payload);
        //        output.append(entity == null ? null : EntityUtils.toString(entity));
    } catch (Exception ignored) {
    }
}

From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

License:Apache License

/**
 * Reads a list of file paths, one per line.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();

    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;

    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;

    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

        if (pos == 0) {
            // Strip BOM(Byte Order Mark)
            // Text only support UTF-8, we only need to check UTF-8 BOM
            // (0xEF,0xBB,0xBF) at the start of the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // find UTF-8 BOM, strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and 
                    // not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }

        paths.add(nextLine.toString());
        // line too long. try again
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}