List of usage examples for org.apache.hadoop.mapreduce.lib.input LineRecordReader nextKeyValue
public boolean nextKeyValue() throws IOException
From source file:edu.umn.cs.spatialHadoop.operations.Head.java
License:Open Source License
/** * Reads a maximum of n lines from the given file. * @param fs//from www . j a v a2 s . c o m * @param p * @param n * @return * @throws IOException */ public static String[] head(FileSystem fs, Path p, int n) throws IOException { String[] lines = new String[n]; FileStatus fstatus = fs.getFileStatus(p); TaskAttemptContext context = createDummyContext(); LineRecordReader lineReader = new LineRecordReader(); FileSplit split; if (p.getName().endsWith(".rtree")) { // R-tree indexed file FSDataInputStream in = fs.open(p); in.skip(8); // Skip R-tree signature int treeHeaderSize = RTree.skipHeader(in); in.close(); split = new FileSplit(p, treeHeaderSize + 8, fstatus.getLen() - treeHeaderSize - 8, new String[0]); } else { split = new FileSplit(p, 0, fstatus.getLen(), new String[0]); } lineReader.initialize(split, context); int numOfLines = 0; for (numOfLines = 0; numOfLines < lines.length && lineReader.nextKeyValue(); numOfLines++) { lines[numOfLines] = lineReader.getCurrentValue().toString(); } lineReader.close(); return lines; }
From source file:example.TestLineRecordReader.java
License:Apache License
private void testSplitRecordsForFile(Configuration conf, long firstSplitLength, long testFileSize, Path testFilePath) throws IOException { conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); assertTrue("unexpected test data at " + testFilePath, testFileSize > firstSplitLength); String delimiter = conf.get("textinputformat.record.delimiter"); byte[] recordDelimiterBytes = null; if (null != delimiter) { recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8); }/*from w ww . jav a 2 s.c o m*/ TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); // read the data without splitting to count the records FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null); LineRecordReader reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); int numRecordsNoSplits = 0; while (reader.nextKeyValue()) { ++numRecordsNoSplits; } reader.close(); // count the records in the first split split = new FileSplit(testFilePath, 0, firstSplitLength, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); int numRecordsFirstSplit = 0; while (reader.nextKeyValue()) { ++numRecordsFirstSplit; } reader.close(); // count the records in the second split split = new FileSplit(testFilePath, firstSplitLength, testFileSize - firstSplitLength, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); int numRecordsRemainingSplits = 0; while (reader.nextKeyValue()) { ++numRecordsRemainingSplits; } reader.close(); assertEquals("Unexpected number of records in split ", numRecordsNoSplits, numRecordsFirstSplit + numRecordsRemainingSplits); }
From source file:example.TestLineRecordReader.java
License:Apache License
public ArrayList<String> readRecords(URL testFileUrl, int splitSize) throws IOException { // Set up context File testFile = new File(testFileUrl.getFile()); long testFileSize = testFile.length(); Path testFilePath = new Path(testFile.getAbsolutePath()); Configuration conf = new Configuration(); conf.setInt("io.file.buffer.size", 1); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); // Gather the records returned by the record reader ArrayList<String> records = new ArrayList<String>(); long offset = 0; while (offset < testFileSize) { FileSplit split = new FileSplit(testFilePath, offset, splitSize, null); LineRecordReader reader = new LineRecordReader(); reader.initialize(split, context); while (reader.nextKeyValue()) { records.add(reader.getCurrentValue().toString()); }//from ww w . j a va 2s. c o m offset += splitSize; } return records; }
From source file:example.TestLineRecordReader.java
License:Apache License
@Test public void testStripBOM() throws IOException { // the test data contains a BOM at the start of the file // confirm the BOM is skipped by LineRecordReader String UTF8_BOM = "\uFEFF"; URL testFileUrl = getClass().getClassLoader().getResource("testBOM.txt"); assertNotNull("Cannot find testBOM.txt", testFileUrl); File testFile = new File(testFileUrl.getFile()); Path testFilePath = new Path(testFile.getAbsolutePath()); long testFileSize = testFile.length(); Configuration conf = new Configuration(); conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); // read the data and check whether BOM is skipped FileSplit split = new FileSplit(testFilePath, 0, testFileSize, (String[]) null); LineRecordReader reader = new LineRecordReader(); reader.initialize(split, context);/* w w w . j a va 2 s . com*/ int numRecords = 0; boolean firstLine = true; boolean skipBOM = true; while (reader.nextKeyValue()) { if (firstLine) { firstLine = false; if (reader.getCurrentValue().toString().startsWith(UTF8_BOM)) { skipBOM = false; } } ++numRecords; } reader.close(); assertTrue("BOM is not skipped", skipBOM); }
From source file:example.TestLineRecordReader.java
License:Apache License
@Test public void testMultipleClose() throws IOException { URL testFileUrl = getClass().getClassLoader().getResource("recordSpanningMultipleSplits.txt.bz2"); assertNotNull("Cannot find recordSpanningMultipleSplits.txt.bz2", testFileUrl); File testFile = new File(testFileUrl.getFile()); Path testFilePath = new Path(testFile.getAbsolutePath()); long testFileSize = testFile.length(); Configuration conf = new Configuration(); conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); // read the data and check whether BOM is skipped FileSplit split = new FileSplit(testFilePath, 0, testFileSize, null); LineRecordReader reader = new LineRecordReader(); reader.initialize(split, context);//from ww w . ja v a2 s . c o m //noinspection StatementWithEmptyBody while (reader.nextKeyValue()) ; reader.close(); reader.close(); BZip2Codec codec = new BZip2Codec(); codec.setConf(conf); Set<Decompressor> decompressors = new HashSet<Decompressor>(); for (int i = 0; i < 10; ++i) { decompressors.add(CodecPool.getDecompressor(codec)); } assertEquals(10, decompressors.size()); }
From source file:example.TestLineRecordReader.java
License:Apache License
@Test public void testUncompressedInputCustomDelimiterPosValue() throws Exception { Configuration conf = new Configuration(); conf.setInt("io.file.buffer.size", 10); conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); String inputData = "abcdefghij++kl++mno"; Path inputFile = createInputFile(conf, inputData); String delimiter = "++"; byte[] recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8); int splitLength = 15; FileSplit split = new FileSplit(inputFile, 0, splitLength, (String[]) null); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); LineRecordReader reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context);/*w w w .j a va 2s . c om*/ // Get first record: "abcdefghij" assertTrue("Expected record got nothing", reader.nextKeyValue()); LongWritable key = reader.getCurrentKey(); Text value = reader.getCurrentValue(); assertEquals("Wrong length for record value", 10, value.getLength()); assertEquals("Wrong position after record read", 0, key.get()); // Get second record: "kl" assertTrue("Expected record got nothing", reader.nextKeyValue()); assertEquals("Wrong length for record value", 2, value.getLength()); // Key should be 12 right after "abcdefghij++" assertEquals("Wrong position after record read", 12, key.get()); // Get third record: "mno" assertTrue("Expected record got nothing", reader.nextKeyValue()); assertEquals("Wrong length for record value", 3, value.getLength()); // Key should be 16 right after "abcdefghij++kl++" assertEquals("Wrong position after record read", 16, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 19 right after "abcdefghij++kl++mno" assertEquals("Wrong position after record read", 19, key.get()); // after refresh should be empty key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); // No record is in the second split because the second split dropped // the first record, which was already reported by the first split. assertFalse("Unexpected record returned", reader.nextKeyValue()); key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); // multi char delimiter with starting part of the delimiter in the data inputData = "abcd+efgh++ijk++mno"; inputFile = createInputFile(conf, inputData); splitLength = 5; split = new FileSplit(inputFile, 0, splitLength, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); // Get first record: "abcd+efgh" assertTrue("Expected record got nothing", reader.nextKeyValue()); key = reader.getCurrentKey(); value = reader.getCurrentValue(); assertEquals("Wrong position after record read", 0, key.get()); assertEquals("Wrong length for record value", 9, value.getLength()); // should have jumped over the delimiter, no record assertFalse(reader.nextKeyValue()); assertEquals("Wrong position after record read", 11, key.get()); // after refresh should be empty key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); // next split: check for duplicate or dropped records split = new FileSplit(inputFile, splitLength, inputData.length() - splitLength, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); assertTrue("Expected record got nothing", reader.nextKeyValue()); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get second record: "ijk" first in this split assertEquals("Wrong position after record read", 11, key.get()); assertEquals("Wrong length for record value", 3, value.getLength()); // Get third record: "mno" second in this split assertTrue("Expected record got nothing", reader.nextKeyValue()); assertEquals("Wrong position after record read", 16, key.get()); assertEquals("Wrong length for record value", 3, value.getLength()); // should be at the end of the input assertFalse(reader.nextKeyValue()); assertEquals("Wrong position after record read", 19, key.get()); reader.close(); inputData = "abcd|efgh|+|ij|kl|+|mno|pqr"; inputFile = createInputFile(conf, inputData); delimiter = "|+|"; recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8); // walking over the buffer and split sizes checks for proper processing // of the ambiguous bytes of the delimiter for (int bufferSize = 1; bufferSize <= inputData.length(); bufferSize++) { for (int splitSize = 1; splitSize < inputData.length(); splitSize++) { // track where we are in the inputdata int keyPosition = 0; conf.setInt("io.file.buffer.size", bufferSize); split = new FileSplit(inputFile, 0, bufferSize, (String[]) null); reader = new LineRecordReader(recordDelimiterBytes); reader.initialize(split, context); // Get the first record: "abcd|efgh" always possible assertTrue("Expected record got nothing", reader.nextKeyValue()); key = reader.getCurrentKey(); value = reader.getCurrentValue(); assertTrue("abcd|efgh".equals(value.toString())); // Position should be 0 right at the start assertEquals("Wrong position after record read", keyPosition, key.get()); // Position should be 12 right after the first "|+|" keyPosition = 12; // get the next record: "ij|kl" if the split/buffer allows it if (reader.nextKeyValue()) { // check the record info: "ij|kl" assertTrue("ij|kl".equals(value.toString())); assertEquals("Wrong position after record read", keyPosition, key.get()); // Position should be 20 after the second "|+|" keyPosition = 20; } // get the third record: "mno|pqr" if the split/buffer allows it if (reader.nextKeyValue()) { // check the record info: "mno|pqr" assertTrue("mno|pqr".equals(value.toString())); assertEquals("Wrong position after record read", keyPosition, key.get()); // Position should be the end of the input keyPosition = inputData.length(); } assertFalse("Unexpected record returned", reader.nextKeyValue()); // no more records can be read we should be at the last position assertEquals("Wrong position after record read", keyPosition, key.get()); // after refresh should be empty key = reader.getCurrentKey(); assertNull("Unexpected key returned", key); reader.close(); } } }
From source file:example.TestLineRecordReader.java
License:Apache License
@Test public void testUncompressedInputDefaultDelimiterPosValue() throws Exception { Configuration conf = new Configuration(); String inputData = "1234567890\r\n12\r\n345"; Path inputFile = createInputFile(conf, inputData); conf.setInt("io.file.buffer.size", 10); conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE); FileSplit split = new FileSplit(inputFile, 0, 15, (String[]) null); TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); LineRecordReader reader = new LineRecordReader(null); reader.initialize(split, context);//from ww w. j ava2 s.c om LongWritable key; Text value; reader.nextKeyValue(); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get first record:"1234567890" assertEquals(10, value.getLength()); assertEquals(0, key.get()); reader.nextKeyValue(); // Get second record:"12" assertEquals(2, value.getLength()); // Key should be 12 right after "1234567890\r\n" assertEquals(12, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 16 right after "1234567890\r\n12\r\n" assertEquals(16, key.get()); split = new FileSplit(inputFile, 15, 4, (String[]) null); reader = new LineRecordReader(null); reader.initialize(split, context); // The second split dropped the first record "\n" reader.nextKeyValue(); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get third record:"345" assertEquals(3, value.getLength()); // Key should be 16 right after "1234567890\r\n12\r\n" assertEquals(16, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 19 right after "1234567890\r\n12\r\n345" assertEquals(19, key.get()); inputData = "123456789\r\r\n"; inputFile = createInputFile(conf, inputData); split = new FileSplit(inputFile, 0, 12, (String[]) null); reader = new LineRecordReader(null); reader.initialize(split, context); reader.nextKeyValue(); key = reader.getCurrentKey(); value = reader.getCurrentValue(); // Get first record:"123456789" assertEquals(9, value.getLength()); assertEquals(0, key.get()); reader.nextKeyValue(); // Get second record:"" assertEquals(0, value.getLength()); // Key should be 10 right after "123456789\r" assertEquals(10, key.get()); assertFalse(reader.nextKeyValue()); // Key should be 12 right after "123456789\r\r\n" assertEquals(12, key.get()); }