List of usage examples for org.apache.hadoop.fs FSDataInputStream seek
@Override public void seek(long desired) throws IOException
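For orientation before the examples: seek(long desired) positions the stream at an absolute byte offset from the start of the file, and getPos() reports the current offset. Below is a minimal sketch of that call pattern; the class name SeekSketch, the offsets 128/64, and the URI are illustrative placeholders, not taken from any of the listed projects.

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FSDataInputStream;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class SeekSketch {
        public static void main(String[] args) throws Exception {
            String uri = args[0]; // e.g. hdfs://namenode:8020/tmp/sample.txt (placeholder)
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(URI.create(uri), conf);
            try (FSDataInputStream in = fs.open(new Path(uri))) {
                in.seek(128);      // jump to an absolute offset from the file start
                byte[] buf = new byte[64];
                in.readFully(buf); // read 64 bytes starting at offset 128
                System.out.println("position after read: " + in.getPos()); // 192
            }
        }
    }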
From source file:com.uber.hoodie.common.table.log.block.HoodieLogBlock.java
License:Apache License
/**
 * Read or skip the block content of a log block in the log file, depending on
 * whether lazy reading is enabled in {@link HoodieMergedLogRecordScanner}.
 */
public static byte[] readOrSkipContent(FSDataInputStream inputStream, Integer contentLength,
        boolean readBlockLazily) throws IOException {
    byte[] content = null;
    if (!readBlockLazily) {
        // Read the contents in memory
        content = new byte[contentLength];
        inputStream.readFully(content, 0, contentLength);
    } else {
        // Seek to the end of the content block
        inputStream.seek(inputStream.getPos() + contentLength);
    }
    return content;
}
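A note on the design choice above: with readBlockLazily disabled the block is buffered eagerly via readFully(), while with it enabled the method returns null and merely advances the stream with seek(getPos() + contentLength), deferring the read (and its memory cost) until the scanner actually needs the block.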
From source file:com.yolodata.tbana.hadoop.mapred.csv.CSVLineRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, JobConf conf) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(conf);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getPath());
    if (codec != null) {
        is = codec.createInputStream(fileIn);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            fileIn.seek(start);
        }
        is = fileIn;
    }
    this.pos = start;
    init(is, conf);
}
From source file:com.yolodata.tbana.hadoop.mapred.shuttl.ShuttlCSVInputFormat.java
License:Open Source License
private static long findEndOfLinePosition(FSDataInputStream in, long end) throws IOException {
    in.seek(end);
    int c;
    String findNewLineBuffer = "";
    while ((c = in.read()) != -1) {
        char ch = (char) c;

        // Compare string contents with equals(); the original '==' reference
        // comparison always fails once concat() has produced a new String object.
        // A real new line is found!
        if (findNewLineBuffer.equals("\"\n") && ch != ',')
            return in.getPos();

        if (findNewLineBuffer.equals("\"") && ch == '\n') {
            findNewLineBuffer = findNewLineBuffer.concat("\n");
            continue;
        }

        if (ch == '\"' && findNewLineBuffer.length() == 0) {
            findNewLineBuffer = "\"";
            continue;
        }

        findNewLineBuffer = "";
    }
    return in.getPos();
}
From source file:com.yolodata.tbana.hadoop.mapred.shuttl.ShuttlCSVRecordReader.java
License:Open Source License
public void initialize(InputSplit genericSplit, JobConf conf) throws IOException {
    CsvSplit split = (CsvSplit) genericSplit;
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getFilepath();
    startKey = split.getKeyStart();

    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream fileIn = fs.open(split.getFilepath());
    if (start != 0) {
        fileIn.seek(start);
    }
    this.is = fileIn;
    this.pos = start;
    createReader(is);

    if (split.isSkipHeader())
        next(null, null);
}
From source file:com.zjy.mongo.splitter.BSONSplitter.java
License:Apache License
/**
 * Get the position at which the BSONFileRecordReader should begin
 * iterating the given split. This may not be at the beginning of the split
 * if the splits were not calculated by BSONSplitter.
 *
 * @param split the FileSplit for which to find the starting position.
 * @return the position of the first complete document within the split.
 * @throws IOException when an error occurs while reading a file
 */
public synchronized long getStartingPositionForSplit(final FileSplit split) throws IOException {
    FileSystem fs = split.getPath().getFileSystem(getConf());
    FileStatus file = fs.getFileStatus(split.getPath());
    ArrayList<BSONFileSplit> splits;
    BSONFileSplit[] splitsArr;

    // Get splits calculated on document boundaries.
    if (MongoConfigUtil.getBSONReadSplits(getConf())) {
        // Use the splits file to load splits on document boundaries.
        try {
            // Try to use the existing splits file.
            loadSplitsFromSplitFile(file, getSplitsFilePath(file.getPath(), getConf()));
        } catch (NoSplitFileException e) {
            // Create a splits file from scratch.
            readSplitsForFile(file);
        }
        splits = getAllSplits();
    } else {
        // Can't use a splits file, so create splits from scratch.
        splits = (ArrayList<BSONFileSplit>) splitFile(file);
    }
    splitsArr = new BSONFileSplit[splits.size()];
    splits.toArray(splitsArr);

    // Get the first pre-calculated split occurring before the start of
    // the given split.
    long previousStart = split.getStart();
    long startIterating = 0;
    for (BSONFileSplit bfs : splitsArr) {
        if (bfs.getStart() >= split.getStart()) {
            startIterating = previousStart;
            break;
        }
        previousStart = bfs.getStart();
    }

    // Beginning at 'startIterating', jump to the first document that begins
    // at or beyond the given split.
    FSDataInputStream fsDataStream = null;
    long pos = startIterating;
    try {
        fsDataStream = fs.open(split.getPath());
        fsDataStream.seek(pos);
        while (pos < split.getStart()) {
            callback.reset();
            bsonDec.decode(fsDataStream, callback);
            pos = fsDataStream.getPos();
        }
    } finally {
        if (null != fsDataStream) {
            fsDataStream.close();
        }
    }
    return pos;
}
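The seek here is only a starting point: BSON documents are length-prefixed and carry no delimiter to scan for, so the reader positions itself at the nearest pre-calculated document boundary before the split and then decodes whole documents forward until getPos() reaches the split start.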
From source file:cosmos.mapred.LongLineRecordReader.java
License:Apache License
@Override
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        in = new LfLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        in = new LfLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        // Skip first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
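This reader shows the standard split-boundary idiom used by several examples on this page: for any split that does not start at byte 0, back up one byte, seek there, and discard the first (possibly partial) line, so that a record straddling the boundary is consumed by exactly one reader and start reflects the first whole record in the split.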
From source file:crunch.MaxTemperature.java
License:Apache License
public static void main(String[] args) throws Exception {
    String uri = args[0];
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), conf);
    FSDataInputStream in = null;
    // XXX FSDataInputStream is Seekable; previous examples had InputStream (no seeking was done)
    try {
        in = fs.open(new Path(uri));
        IOUtils.copyBytes(in, System.out, 4096, false);
        in.seek(0); // go back to the start of the file XXX
        IOUtils.copyBytes(in, System.out, 4096, false);
    } finally {
        IOUtils.closeStream(in);
    }
}
From source file:cs480a2.yqiu.recSystem.mapreduce.input.SingleBookReader.java
/**
 * @param inputSplit
 * @param context the information about the task
 * @throws IOException
 * @throws InterruptedException
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    FileSplit split = (FileSplit) inputSplit;
    Configuration configuration = context.getConfiguration();
    Path path = split.getPath();
    filename = path.getName();
    FileSystem fileSystem = path.getFileSystem(configuration);
    FSDataInputStream inputStream = fileSystem.open(path);
    lineReader = new LineReader(inputStream, configuration);

    // Initial start point and end point
    start = split.getStart();
    end = start + split.getLength();

    inputStream.seek(start);
    if (start != 0) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }
    start += lineReader.readLine(currentLine);

    prepareToScanBook();
}
From source file:de.l3s.archivepig.enrich.Response.java
License:Open Source License
@Override
public void enrich(Tuple data, Tuple enrichment, Object... params) throws Exception {
    long size = get(data, "_record.size");
    long offset = get(data, "_record.offset");
    String filename = get(data, "_record.filename");
    String cdxFile = get(data, "_record.cdxFile");

    if (size < 0 || offset < 0)
        return;

    FileSystem fs = FileSystem.get(UDFContext.getUDFContext().getJobConf());

    Deque<String> cdxSegments = new ArrayDeque<String>(Lists.reverse(list(cdxFile.split("\\/"))));
    cdxSegments.pop(); // remove filename
    String pathExtension = "";
    Path path = new Path(ArchiveLoader.dataPath(), pathExtension + filename);
    while (!fs.exists(path)) {
        if (cdxSegments.isEmpty()) {
            enrichment.append(new HashMap<String, String>());
            enrichment.append(new HashMap<String, String>());
            enrichment.append(null);
            return;
        }
        String cdxSegment = cdxSegments.pop();
        if (cdxSegment.endsWith(".har"))
            cdxSegment = cdxSegment.substring(0, cdxSegment.length() - 4);
        pathExtension = cdxSegment + "/" + pathExtension;
        path = new Path(ArchiveLoader.dataPath(), pathExtension + filename);
    }
    FSDataInputStream fsin = fs.open(path);
    fsin.seek(offset);
    InputStream in = fsin;

    ByteArrayOutputStream recordOutput = new ByteArrayOutputStream();
    try {
        try (BoundedInputStream boundedIn = new BoundedInputStream(in, size);
                ArchiveReader reader = ArchiveReaderFactory.get(filename, boundedIn, false)) {
            ArchiveRecord record = reader.get();
            ArchiveRecordHeader header = record.getHeader();
            enrichment.append(header.getHeaderFields());
            record.dump(recordOutput);
        } catch (Exception e) {
            return;
        } finally {
            in.close();
            recordOutput.close();
        }
    } catch (Exception e) {
        return;
    }

    try (InputStream httpResponse = new ByteArrayInputStream(recordOutput.toByteArray())) {
        // ALL COMMENTS ARE NEW VERSION VARIANTS FOR HTTP-CORE 4.3, currently in use 4.2.5
        // SessionInputBufferImpl sessionInputBuffer = new SessionInputBufferImpl(new HttpTransportMetricsImpl(), 2048);
        // sessionInputBuffer.bind(httpResponse);
        // DefaultHttpResponseParserFactory responseParserFactory = new DefaultHttpResponseParserFactory();
        // HttpMessageParser<HttpResponse> responseParser = responseParserFactory.create(sessionInputBuffer, MessageConstraints.DEFAULT);
        // HttpResponse response = responseParser.parse();
        // Header[] httpHeaders = response.getAllHeaders();
        HttpResponseParser parser = new HttpResponseParser();
        HttpResponse response = parser.parse(httpResponse);
        HttpHeaders httpHeaders = response.getHeaders();

        Map<String, String> httpHeadersMap = new HashMap<String, String>();
        for (HttpHeader httpHeader : httpHeaders) {
            httpHeadersMap.put(httpHeader.getName(), httpHeader.getValue());
        }
        enrichment.append(httpHeadersMap);

        // byte[] payload = new byte[sessionInputBuffer.length()];
        // sessionInputBuffer.read(payload);
        byte[] payload = IOUtils.toByteArray(response);
        enrichment.append(payload);

        // HttpEntity entity = new ByteArrayEntity(payload);
        // output.append(entity == null ? null : EntityUtils.toString(entity));
    } catch (Exception ignored) {
    }
}
From source file:de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java
License:Apache License
/**
 * Reads a list of file paths, one path per line.
 * The code in this method is adapted from Hadoop's LineRecordReader.
 *
 * @throws IOException
 */
private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
    FileSplit fileSplit = (FileSplit) split;
    Path path = fileSplit.getPath();
    long begin = fileSplit.getStart();
    long end = begin + fileSplit.getLength();

    LOG.info("Reading paths in file " + path.getName());

    // First check the compression codec
    CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodec.getCodec(path);
    FSDataInputStream fis = fs.open(path);
    SplitLineReader in;
    Seekable filePosition;

    boolean compressed = false;
    Decompressor decompressor = null;
    if (null != codec) {
        compressed = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                    decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
            begin = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
            filePosition = fis;
        }
    } else {
        fis.seek(begin);
        in = new SplitLineReader(fis, conf, (byte[]) null);
        filePosition = fis;
    }

    // If this is not the first split, we always throw away the first record
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (begin != 0) {
        begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
    }
    long pos = begin;
    int newSize = 0;
    final Text nextLine = new Text();
    paths = new ArrayList<>();
    while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {
        if (pos == 0) {
            // Strip BOM (Byte Order Mark). Text only supports UTF-8, so we only
            // need to check for the UTF-8 BOM (0xEF,0xBB,0xBF) at the start of
            // the text stream.
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
            pos += newSize;
            int textLength = nextLine.getLength();
            byte[] textBytes = nextLine.getBytes();
            if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                    && (textBytes[2] == (byte) 0xBF)) {
                // Found the UTF-8 BOM; strip it.
                LOG.info("Found UTF-8 BOM and skipped it");
                textLength -= 3;
                newSize -= 3;
                if (textLength > 0) {
                    // It may work to use the same buffer and not do the copyBytes
                    textBytes = nextLine.copyBytes();
                    nextLine.set(textBytes, 3, textLength);
                } else {
                    nextLine.clear();
                }
            }
        } else {
            newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
            pos += newSize;
        }
        if (newSize == 0) {
            // Nothing was read: end of stream. Break to avoid spinning forever
            // (the original adaptation lacked this guard from LineRecordReader).
            break;
        }
        paths.add(nextLine.toString());
        // Carried over from LineRecordReader's long-line handling; here it logs every line read.
        LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
    }

    try {
        if (in != null) {
            in.close();
        }
        if (fis != null) {
            fis.close();
        }
    } finally {
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
        }
    }
}