List of usage examples for org.apache.hadoop.io BytesWritable getLength
@Override public int getLength()
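Returns the current size of the valid data in the buffer. Note that every example below pairs getBytes() with getLength(): getBytes() returns the backing buffer, which may be larger than the valid data, so reads must be bounded by getLength(). A minimal sketch of that pattern (the class name and sample values are illustrative, not taken from the sources below):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;

public class BytesWritableGetLengthExample {
    public static void main(String[] args) {
        byte[] payload = "hello".getBytes(StandardCharsets.UTF_8);
        BytesWritable writable = new BytesWritable(payload);

        // getLength() reports the number of valid bytes (5 here)
        System.out.println("length = " + writable.getLength());

        // getBytes() returns the backing buffer, which can be larger than
        // getLength() once the writable has been reused or resized, so
        // always bound reads by getLength()
        byte[] valid = Arrays.copyOfRange(writable.getBytes(), 0, writable.getLength());
        System.out.println(new String(valid, StandardCharsets.UTF_8));
    }
}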
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileRecordReaderTests.java
License:Apache License
@Test
public void TestARCFileRecordReader() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    FileSystem fs = LocalFileSystem.get(conf);
    Path path = new Path("/tmp/" + File.createTempFile("ARCRecordReader", "test"));

    List<TestRecord> records = ArcFileReaderTests.buildTestRecords(ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    FSDataOutputStream os = fs.create(path);
    try {
        // write the ARC File into memory
        ArcFileReaderTests.writeFirstRecord(os, "test", System.currentTimeMillis());
        long testAttemptTime = System.currentTimeMillis();
        for (TestRecord record : records) {
            ArcFileReaderTests.write(os, record.url, "test", 1, 1, record.data, 0, record.data.length,
                    new NIOHttpHeaders(), "text/html", MD5Hash.digest(record.data).toString(), 12345,
                    testAttemptTime);
        }
        os.flush();
    } finally {
        os.close();
    }

    FileSplit split = new FileSplit(path, 0, fs.getFileStatus(path).getLen(), new String[0]);
    ARCFileRecordReader reader = new ARCFileRecordReader();
    reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));

    int index = 0;
    // iterate and validate stuff ...
    while (reader.nextKeyValue()) {
        Text key = reader.getCurrentKey();
        BytesWritable value = reader.getCurrentValue();

        TestRecord testRecord = records.get(index++);

        // get test key bytes as utf-8 bytes ...
        byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
        // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
        // invalid characters with '?', which would cause our test case, which does use invalid characters to
        // form the key, to break)
        Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0,
                key.getLength()) == 0);
        // returned bytes represent the header (encoded in UTF-8), terminated by a \r\n\r\n. The content follows
        // this terminator; we search for this specific byte pattern to locate the start of content, then
        // compare it against the source ...
        int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                "\r\n\r\n".getBytes());
        indexofHeaderTerminator += 4;
        Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length,
                value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0);
    }
    reader.close();

    Assert.assertEquals(index, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT);

    fs.delete(path, false);
}
From source file:org.commoncrawl.util.CompressedURLFPListV2.java
License:Open Source License
public static void main(String[] args) {
    // initialize ...
    final Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    BasicConfigurator.configure();
    CrawlEnvironment.setHadoopConfig(conf);

    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Path testFile = new Path("crawl/linkdb/merged1282844121161/linkData/part-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, testFile, conf);

        URLFPV2 fp = new URLFPV2();
        BytesWritable bytes = new BytesWritable();

        while (reader.next(fp, bytes)) {
            if (bytes.getLength() != 0) {
                DataInputBuffer inputStream = new DataInputBuffer();
                inputStream.reset(bytes.get(), bytes.getLength());
                CompressedURLFPListV2.Reader listReader = new CompressedURLFPListV2.Reader(inputStream);
                while (listReader.hasNext()) {
                    URLFPV2 nextFP = listReader.next();
                    LOG.info("DH:" + nextFP.getDomainHash() + " UH:" + nextFP.getUrlHash());
                }
            } else {
                LOG.error("ZERO BYTE LIST!");
            }
        }
        reader.close();
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }

    // early exit: skip the validation routines below
    if (1 == 1)
        return;

    validateDuplicateChecking();
    // validateReallyBigList();
    validateURLFPSerializationRootDomain();
    validateURLFPSerializationSingleSubDomain();
    validateURLFPSerializationMultiDomain();
    validateURLFPFlagSerializationRootDomain();
    validateURLFPFlagSerializationMultipleSubDomains();
    validateURLFPFlagSerializationOneSubDomain();
}
From source file:org.commoncrawl.util.shared.ARCFileReader.java
License:Apache License
public static void main(String[] args) throws IOException, URISyntaxException, InterruptedException {
    Configuration conf = new Configuration();
    String path = null;

    CommandLineParser parser = new GnuParser();
    try {
        // parse the command line arguments
        CommandLine cmdLine = parser.parse(options, args);
        // get ARCFile Path
        path = cmdLine.getOptionValue("file");
        // get optional config
        if (cmdLine.hasOption("conf")) {
            conf.addResource(new Path(cmdLine.getOptionValue("conf")));
        }
        if (cmdLine.hasOption("awsAccessKey")) {
            conf.set("fs.s3n.awsAccessKeyId", cmdLine.getOptionValue("awsAccessKey"));
        }
        if (cmdLine.hasOption("awsSecret")) {
            conf.set("fs.s3n.awsSecretAccessKey", cmdLine.getOptionValue("awsSecret"));
        }
    } catch (ParseException e) {
        System.out.println(e.toString());
        printUsage();
        System.exit(1);
    }

    final URI uri = new URI(path);
    FileSystem fs = FileSystem.get(uri, conf);

    // byte data[] = new byte[4096 * 10];
    // int readAmt = 0;
    // while ((readAmt = stream.get().read(data)) != -1) {
    //     System.out.println(HexDump.dumpHexString(data, 0, readAmt));
    // }
    // stream.get().close();
    // System.exit(1);

    ARCFileReader reader = null;
    try {
        System.out.println("Initializing Reader for Path:" + uri);
        reader = new ARCFileReader(fs.open(new Path(path)));

        Text key = new Text();
        BytesWritable value = new BytesWritable();

        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            int indexOfTrailingCRLF = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            int headerLen = indexOfTrailingCRLF + 4;
            int contentLen = value.getLength() - headerLen;

            String outputStr = "Key:" + key.toString() + " HeaderLen:" + headerLen + " ContentLen:" + contentLen;
            System.out.println(outputStr);

            // String contentStr = new String(value.getBytes(), headerLen, contentLen, Charset.forName("ASCII"));
            // System.out.println(contentStr.substring(contentStr.length() - 20));
        }
        System.out.println("Exiting Loop");
    } catch (Exception e) {
        System.out.println(CCStringUtils.stringifyException(e));
        LOG.error(CCStringUtils.stringifyException(e));
        // throw new IOException(e);
    } finally {
        if (reader != null) {
            System.out.println("***Closing Reader");
            reader.close();
        }
    }
}
From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java
License:Apache License
/**
 * test basic reader functionality by creating a mock ARCFile in memory and then reading it back and
 * validating the contents...
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory
        writeFirstRecord(os, "test", timestamp);

        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);

        long testAttemptTime = System.currentTimeMillis();
        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }
            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {
            // force reads of at most one byte at a time
            public synchronized int read(byte b[], int off, int len) {
                len = 1;
                return super.read(b, off, len);
            }

            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;

        Text key = new Text();
        BytesWritable value = new BytesWritable();

        // iterate and validate stuff ...
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);

            TestRecord testRecord = records.get(index++);

            // get test key bytes as utf-8 bytes ...
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against raw key bytes to validate the key is the same (Text's UTF-8 mapping code replaces
            // invalid characters with '?', which would cause our test case, which does use invalid characters
            // to form the key, to break)
            Assert.assertTrue(
                    compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes represent the header (encoded in UTF-8), terminated by a \r\n\r\n. The content
            // follows this terminator; we search for this specific byte pattern to locate the start of content,
            // then compare it against the source ...
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            indexofHeaderTerminator += 4;
            // read headers ...
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
                    Charset.forName("UTF-8"));
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }

            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                    indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:org.elasticsearch.hadoop.mr.MapReduceWriter.java
License:Apache License
@SuppressWarnings("unchecked") public boolean write(Writable writable, Generator generator) { if (writable == null || writable instanceof NullWritable) { generator.writeNull();//from w w w . j a v a 2 s.c o m } else if (writable instanceof Text) { Text text = (Text) writable; generator.writeUTF8String(text.getBytes(), 0, text.getLength()); } else if (writable instanceof UTF8) { UTF8 utf8 = (UTF8) writable; generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength()); } else if (writable instanceof IntWritable) { generator.writeNumber(((IntWritable) writable).get()); } else if (writable instanceof LongWritable) { generator.writeNumber(((LongWritable) writable).get()); } else if (writable instanceof VLongWritable) { generator.writeNumber(((VLongWritable) writable).get()); } else if (writable instanceof VIntWritable) { generator.writeNumber(((VIntWritable) writable).get()); } else if (writable instanceof ByteWritable) { generator.writeNumber(((ByteWritable) writable).get()); } else if (writable instanceof DoubleWritable) { generator.writeNumber(((DoubleWritable) writable).get()); } else if (writable instanceof FloatWritable) { generator.writeNumber(((FloatWritable) writable).get()); } else if (writable instanceof BooleanWritable) { generator.writeBoolean(((BooleanWritable) writable).get()); } else if (writable instanceof BytesWritable) { BytesWritable bw = (BytesWritable) writable; generator.writeBinary(bw.getBytes(), 0, bw.getLength()); } else if (writable instanceof MD5Hash) { generator.writeString(writable.toString()); } else if (writable instanceof ArrayWritable) { generator.writeBeginArray(); for (Writable wrt : ((ArrayWritable) writable).get()) { if (!write(wrt, generator)) { return false; } } generator.writeEndArray(); } else if (writable instanceof AbstractMapWritable) { Map<Writable, Writable> map = (Map<Writable, Writable>) writable; generator.writeBeginObject(); // ignore handling sets (which are just maps with null values) for (Entry<Writable, Writable> entry : map.entrySet()) { generator.writeFieldName(entry.getKey().toString()); if (!write(entry.getValue(), generator)) { return false; } } generator.writeEndObject(); } else { if (writeUnknownTypes) { return handleUnknown(writable, generator); } return false; } return true; }
From source file:org.elasticsearch.hadoop.mr.SafeWritableConverter.java
License:Apache License
public void invoke(Object from, BytesArray to) {
    // handle common cases
    if (from instanceof Text) {
        Text t = (Text) from;
        to.bytes(t.getBytes(), t.getLength());
    }
    if (from instanceof BytesWritable) {
        BytesWritable b = (BytesWritable) from;
        to.bytes(b.getBytes(), b.getLength());
    }
}
From source file:org.elasticsearch.hadoop.mr.WritableBytesConverter.java
License:Apache License
@Override
public void convert(Object from, BytesArray to) {
    // handle common cases
    if (from instanceof Text) {
        Text t = (Text) from;
        to.bytes(t.getBytes(), t.getLength());
        return;
    }
    if (from instanceof BytesWritable) {
        BytesWritable b = (BytesWritable) from;
        to.bytes(b.getBytes(), b.getLength());
        return;
    }
    super.convert(from, to);
}
From source file:org.elasticsearch.hadoop.mr.WritableValueWriter.java
License:Apache License
@Override
@SuppressWarnings({ "unchecked", "deprecation" })
public Result write(Writable writable, Generator generator) {
    if (writable == null || writable instanceof NullWritable) {
        generator.writeNull();
    } else if (writable instanceof Text) {
        Text text = (Text) writable;
        generator.writeUTF8String(text.getBytes(), 0, text.getLength());
    } else if (writable instanceof UTF8) {
        UTF8 utf8 = (UTF8) writable;
        generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength());
    } else if (WritableCompatUtil.isShortWritable(writable)) {
        generator.writeNumber(WritableCompatUtil.unwrap(writable));
    } else if (writable instanceof IntWritable) {
        generator.writeNumber(((IntWritable) writable).get());
    } else if (writable instanceof LongWritable) {
        generator.writeNumber(((LongWritable) writable).get());
    } else if (writable instanceof VLongWritable) {
        generator.writeNumber(((VLongWritable) writable).get());
    } else if (writable instanceof VIntWritable) {
        generator.writeNumber(((VIntWritable) writable).get());
    } else if (writable instanceof ByteWritable) {
        generator.writeNumber(((ByteWritable) writable).get());
    } else if (writable instanceof DoubleWritable) {
        generator.writeNumber(((DoubleWritable) writable).get());
    } else if (writable instanceof FloatWritable) {
        generator.writeNumber(((FloatWritable) writable).get());
    } else if (writable instanceof BooleanWritable) {
        generator.writeBoolean(((BooleanWritable) writable).get());
    } else if (writable instanceof BytesWritable) {
        BytesWritable bw = (BytesWritable) writable;
        generator.writeBinary(bw.getBytes(), 0, bw.getLength());
    } else if (writable instanceof MD5Hash) {
        generator.writeString(writable.toString());
    } else if (writable instanceof ArrayWritable) {
        generator.writeBeginArray();
        for (Writable wrt : ((ArrayWritable) writable).get()) {
            Result result = write(wrt, generator);
            if (!result.isSuccesful()) {
                return result;
            }
        }
        generator.writeEndArray();
    } else if (writable instanceof AbstractMapWritable) {
        Map<Writable, Writable> map = (Map<Writable, Writable>) writable;
        generator.writeBeginObject();
        // ignore handling sets (which are just maps with null values)
        for (Entry<Writable, Writable> entry : map.entrySet()) {
            String fieldName = entry.getKey().toString();
            if (shouldKeep(generator.getParentPath(), fieldName)) {
                generator.writeFieldName(fieldName);
                Result result = write(entry.getValue(), generator);
                if (!result.isSuccesful()) {
                    return result;
                }
            }
        }
        generator.writeEndObject();
    } else {
        if (writeUnknownTypes) {
            return handleUnknown(writable, generator);
        }
        return Result.FAILED(writable);
    }
    return Result.SUCCESFUL();
}
From source file:org.freeeed.mr.FreeEedReducer.java
License:Apache License
protected void processMap(MapWritable value) throws IOException, InterruptedException {
    columnMetadata.reinit();
    ++outputFileCount;
    DocumentMetadata allMetadata = getAllMetadata(value);
    Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);
    // documents other than the first one in this loop are either duplicates or attachments
    if (first) {
        masterOutputFileCount = outputFileCount;
    } else {
        if (allMetadata.hasParent()) {
            columnMetadata.addMetadataValue(DocumentMetadataKeys.ATTACHMENT_PARENT,
                    UPIFormat.format(masterOutputFileCount));
        } else {
            columnMetadata.addMetadataValue(DocumentMetadataKeys.MASTER_DUPLICATE,
                    UPIFormat.format(masterOutputFileCount));
        }
    }
    // String uniqueId = allMetadata.getUniqueId();
    String originalFileName = new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName = ParameterProcessing.TEXT + "/" + UPIFormat.format(outputFileCount) + "_"
            + originalFileName + ".txt";
    if (textEntryName != null) {
        zipFileWriter.addTextFile(textEntryName, documentText);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName);
    // add the native file to the native folder
    String nativeEntryName = ParameterProcessing.NATIVE + "/" + UPIFormat.format(outputFileCount) + "_"
            + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
        zipFileWriter.addBinaryFile(nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
        logger.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);
    // add the pdf made from the native file to the PDF folder
    String pdfNativeEntryName = ParameterProcessing.PDF_FOLDER + "/" + UPIFormat.format(outputFileCount) + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".pdf";
    BytesWritable pdfBytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
        zipFileWriter.addBinaryFile(pdfNativeEntryName, pdfBytesWritable.getBytes(),
                pdfBytesWritable.getLength());
        logger.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount));

    // add the exception to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
        String exceptionEntryName = "exception/" + UPIFormat.format(outputFileCount) + "_"
                + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
        if (bytesWritable != null) {
            zipFileWriter.addBinaryFile(exceptionEntryName, bytesWritable.getBytes(),
                    bytesWritable.getLength());
        }
        columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }
    // write this all to the reduce map
    // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
    // drop the key altogether, because it messes up the format - but put it in the value
    // TODO use NullWritable
    if (OsUtil.isNix()) {
        context.write(null, new Text(columnMetadata.delimiterSeparatedValues()));
    }
    // prepare for the next file with the same key, if there is any
    first = false;
}
From source file:org.freeeed.mr.FreeEedReducer.java
License:Apache License
private void processHtmlContent(MapWritable value, Metadata allMetadata, String uniqueId) throws IOException {
    BytesWritable htmlBytesWritable = (BytesWritable) value
            .get(new Text(ParameterProcessing.NATIVE_AS_HTML_NAME));
    if (htmlBytesWritable != null) {
        String htmlNativeEntryName = ParameterProcessing.HTML_FOLDER + "/" + uniqueId + "_"
                + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".html";
        zipFileWriter.addBinaryFile(htmlNativeEntryName, htmlBytesWritable.getBytes(),
                htmlBytesWritable.getLength());
        logger.trace("Processing file: {}", htmlNativeEntryName);
    }

    // get the list of other files that are part of the html output
    Text htmlFiles = (Text) value.get(new Text(ParameterProcessing.NATIVE_AS_HTML));
    if (htmlFiles != null) {
        String fileNames = htmlFiles.toString();
        String[] fileNamesArr = fileNames.split(",");
        for (String fileName : fileNamesArr) {
            String entry = ParameterProcessing.HTML_FOLDER + "/" + fileName;
            BytesWritable imageBytesWritable = (BytesWritable) value
                    .get(new Text(ParameterProcessing.NATIVE_AS_HTML + "_" + fileName));
            if (imageBytesWritable != null) {
                zipFileWriter.addBinaryFile(entry, imageBytesWritable.getBytes(),
                        imageBytesWritable.getLength());
                logger.trace("Processing file: {}", entry);
            }
        }
    }
}