Usage examples for org.apache.hadoop.io.Text#getLength()
@Override public int getLength()
From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java
License:Open Source License
/**
 * Maps a crawled URL and its fetch result into a ParseOutput record.
 *
 * Builds a JSON metadata object and a parallel CrawlMetadata structure from
 * the CrawlURL, optionally extracts content metadata for successful 2xx
 * fetches with a body, and emits (final URL, ParseOutput) to the collector.
 *
 * NOTE(review): processing exceptions are counted and swallowed (see the
 * TODO:HACK notes below) so a single bad record cannot fail the whole task.
 */
@Override
public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
        throws IOException {
    // guard: an empty key means there is no URL to process
    if (url.getLength() == 0) {
        LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
        return;
    }
    try {
        // allocate parse output
        ParseOutput parseOutput = new ParseOutput();
        // json object out ...
        JsonObject jsonObj = new JsonObject();
        // and create a crawl metadata
        CrawlMetadata metadata = parseOutput.getCrawlMetadata();
        // and content (if available) ...
        Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;
        URL originalURL = null;

        try {
            originalURL = new URL(url.toString());
        } catch (MalformedURLException e) {
            LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
            reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
            return;
        }

        // finalURL is replaced below when the fetch was redirected
        URL finalURL = originalURL;

        jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
        metadata.setAttemptTime(value.getLastAttemptTime());

        // first step write status
        jsonObj.addProperty("disposition",
                (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
        // disposition byte: 0 == success, 1 == failure
        metadata.setCrawlDisposition(
                (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

        // deal with redirects ...
        if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
            Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
            jsonObj.add("redirect_from", redirect.e1);
            finalURL = redirect.e0;
        }

        if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
            // failed fetch: record failure reason and detail only
            jsonObj.addProperty("failure_reason",
                    CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
            metadata.setFailureReason(value.getLastAttemptFailureReason());
            jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
            metadata.setFailureDetail(value.getLastAttemptFailureDetail());
        } else {
            // successful fetch: record server / protocol details
            jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
            metadata.setServerIP(value.getServerIP());
            jsonObj.addProperty("http_result", value.getResultCode());
            metadata.setHttpResult(value.getResultCode());
            jsonObj.add("http_headers",
                    httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
            metadata.setHttpHeaders(value.getHeaders());
            jsonObj.addProperty("content_len", value.getContentRaw().getCount());
            metadata.setContentLength(value.getContentRaw().getCount());
            // only extract content metadata for 2xx responses with a non-empty body
            if (value.getResultCode() >= 200 && value.getResultCode() <= 299
                    && value.getContentRaw().getCount() > 0) {
                contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
            }
        }

        // ok ... write stuff out ...
        reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);

        // echo some stuff to parseOutput ...
        parseOutput.setMetadata(jsonObj.toString());
        JsonElement mimeType = jsonObj.get("mime_type");
        if (mimeType != null) {
            parseOutput.setNormalizedMimeType(mimeType.getAsString());
        }
        JsonElement md5 = jsonObj.get("md5");
        if (md5 != null) {
            MD5Hash hash = new MD5Hash(md5.getAsString());
            byte[] bytes = hash.getDigest();
            parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
        }
        JsonElement simHash = jsonObj.get("text_simhash");
        if (simHash != null) {
            parseOutput.setSimHash(simHash.getAsLong());
        }
        parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
        parseOutput.setFetchTime(value.getLastAttemptTime());

        if (contentOut != null) {
            if (contentOut.e0 != null) {
                parseOutput.setTextContent(contentOut.e0);
                reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
            }
            if (contentOut.e1 != null) {
                // directly set the text bytes ...
                parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
                // mark it dirty !!!
                parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
                // if content available ...
                if (contentOut.e1.e1 != null) {
                    parseOutput.setRawContent(contentOut.e1.e1);
                }
                reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
            }
        }

        output.collect(new Text(finalURL.toString()), parseOutput);
    } catch (IOException e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
        // TODO:HACK -- intentionally swallowed so one bad record doesn't fail the task
        // throw e;
    } catch (Exception e) {
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
        // TODO: HACK -- intentionally swallowed
        // throw new IOException(e);
    }
}
From source file:org.commoncrawl.util.JoinValue.java
License:Open Source License
public JoinValue(TextBytes tag, Text value) { _tag = tag;//from w ww .j a v a2 s . c o m _type = TEXT_TYPE_JOIN_VALUE; _textValue = new TextBytes(); _textValue.set(value.getBytes(), 0, value.getLength()); }
From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java
License:Apache License
/**
 * Tests basic ARCFileReader functionality: writes a mock ARC file into an
 * in-memory buffer, reads it back one byte at a time (to stress partial-read
 * handling), and validates keys, headers, and content against the source
 * test records.
 */
@Test
public void testReader() {
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory
        writeFirstRecord(os, "test", timestamp);
        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
        long testAttemptTime = System.currentTimeMillis();
        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }
            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file; read() is overridden to hand
        // back one byte at a time so the reader's buffering logic is exercised
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {
            public synchronized int read(byte b[], int off, int len) {
                len = 1;
                return super.read(b, off, len);
            }

            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;
        Text key = new Text();
        BytesWritable value = new BytesWritable();

        // iterate and validate stuff ...
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            TestRecord testRecord = records.get(index++);
            // get test key bytes as utf-8 bytes ...
            byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8"));
            // compare against raw key bytes to validate key is the same (Text's
            // utf-8 mapping code replaces invalid characters with ?, which would
            // break this test since the keys deliberately contain invalid
            // characters)
            Assert.assertTrue(
                    compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes are the header (utf-8), terminated by \r\n\r\n;
            // the content follows this terminator, so locate it first
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    "\r\n\r\n".getBytes());
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            indexofHeaderTerminator += 4;
            // read headers ...
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator,
                    Charset.forName("UTF-8"));
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }
            // content after the terminator must match the source record's data
            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                    indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        Assert.assertEquals(index, BASIC_TEST_RECORD_COUNT);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
/** Copies the contents of a {@link Text} into this object. */
public void set(Text other) {
    // bound the copy by getLength(): Text.getBytes() can return a backing
    // array padded past the logical end of the data
    byte[] raw = other.getBytes();
    int validLength = other.getLength();
    set(raw, 0, validLength);
}
From source file:org.commoncrawl.util.URLUtils.java
License:Open Source License
public static String getHostNameFromURLKey(Text key) { fastGetResult result = fastGetHostFromTextURL(key.getBytes(), 0, key.getLength()); if (result != null && result.length != 0) { String hostName = new String(key.getBytes(), result.offset, result.length); return hostName; }//from w w w . java2 s.co m return null; }
From source file:org.elasticsearch.hadoop.mr.MapReduceWriter.java
License:Apache License
/**
 * Serializes a Hadoop Writable into the Generator's output, dispatching on
 * the concrete Writable type. Arrays and maps recurse through this method,
 * so a single unhandled nested value aborts the whole write.
 *
 * @param writable  value to serialize (null / NullWritable is written as null)
 * @param generator output sink
 * @return true on success; false if a (possibly nested) value was unhandled
 */
@SuppressWarnings("unchecked")
public boolean write(Writable writable, Generator generator) {
    if (writable == null || writable instanceof NullWritable) {
        generator.writeNull();
    } else if (writable instanceof Text) {
        Text text = (Text) writable;
        // write only the valid region of Text's backing array
        generator.writeUTF8String(text.getBytes(), 0, text.getLength());
    } else if (writable instanceof UTF8) {
        // legacy pre-Text string type
        UTF8 utf8 = (UTF8) writable;
        generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength());
    } else if (writable instanceof IntWritable) {
        generator.writeNumber(((IntWritable) writable).get());
    } else if (writable instanceof LongWritable) {
        generator.writeNumber(((LongWritable) writable).get());
    } else if (writable instanceof VLongWritable) {
        generator.writeNumber(((VLongWritable) writable).get());
    } else if (writable instanceof VIntWritable) {
        generator.writeNumber(((VIntWritable) writable).get());
    } else if (writable instanceof ByteWritable) {
        generator.writeNumber(((ByteWritable) writable).get());
    } else if (writable instanceof DoubleWritable) {
        generator.writeNumber(((DoubleWritable) writable).get());
    } else if (writable instanceof FloatWritable) {
        generator.writeNumber(((FloatWritable) writable).get());
    } else if (writable instanceof BooleanWritable) {
        generator.writeBoolean(((BooleanWritable) writable).get());
    } else if (writable instanceof BytesWritable) {
        BytesWritable bw = (BytesWritable) writable;
        // only the first getLength() bytes of the buffer are valid
        generator.writeBinary(bw.getBytes(), 0, bw.getLength());
    } else if (writable instanceof MD5Hash) {
        generator.writeString(writable.toString());
    } else if (writable instanceof ArrayWritable) {
        generator.writeBeginArray();
        for (Writable wrt : ((ArrayWritable) writable).get()) {
            // recurse; propagate failure of any element
            if (!write(wrt, generator)) {
                return false;
            }
        }
        generator.writeEndArray();
    } else if (writable instanceof AbstractMapWritable) {
        Map<Writable, Writable> map = (Map<Writable, Writable>) writable;
        generator.writeBeginObject();
        // ignore handling sets (which are just maps with null values)
        for (Entry<Writable, Writable> entry : map.entrySet()) {
            generator.writeFieldName(entry.getKey().toString());
            if (!write(entry.getValue(), generator)) {
                return false;
            }
        }
        generator.writeEndObject();
    } else {
        // unknown Writable: optionally delegate, otherwise report failure
        if (writeUnknownTypes) {
            return handleUnknown(writable, generator);
        }
        return false;
    }
    return true;
}
From source file:org.elasticsearch.hadoop.mr.SafeWritableConverter.java
License:Apache License
public void invoke(Object from, BytesArray to) { // handle common cases if (from instanceof Text) { Text t = (Text) from; to.bytes(t.getBytes(), t.getLength()); }/* www.ja va 2 s . c o m*/ if (from instanceof BytesWritable) { BytesWritable b = (BytesWritable) from; to.bytes(b.getBytes(), b.getLength()); } }
From source file:org.elasticsearch.hadoop.mr.WritableBytesConverter.java
License:Apache License
@Override public void convert(Object from, BytesArray to) { // handle common cases if (from instanceof Text) { Text t = (Text) from; to.bytes(t.getBytes(), t.getLength()); return;/*from www. j a v a2s . c o m*/ } if (from instanceof BytesWritable) { BytesWritable b = (BytesWritable) from; to.bytes(b.getBytes(), b.getLength()); return; } super.convert(from, to); }
From source file:org.elasticsearch.hadoop.mr.WritableValueWriter.java
License:Apache License
/**
 * Serializes a Hadoop Writable into the Generator's output, dispatching on
 * the concrete Writable type. Arrays and maps recurse through this method
 * and propagate the first failed Result; map fields are additionally
 * filtered through shouldKeep() against the generator's current path.
 *
 * @param writable  value to serialize (null / NullWritable is written as null)
 * @param generator output sink
 * @return SUCCESFUL on success, otherwise the failing Result
 */
@Override
@SuppressWarnings({ "unchecked", "deprecation" })
public Result write(Writable writable, Generator generator) {
    if (writable == null || writable instanceof NullWritable) {
        generator.writeNull();
    } else if (writable instanceof Text) {
        Text text = (Text) writable;
        // write only the valid region of Text's backing array
        generator.writeUTF8String(text.getBytes(), 0, text.getLength());
    } else if (writable instanceof UTF8) {
        // legacy pre-Text string type (hence the "deprecation" suppression)
        UTF8 utf8 = (UTF8) writable;
        generator.writeUTF8String(utf8.getBytes(), 0, utf8.getLength());
    } else if (WritableCompatUtil.isShortWritable(writable)) {
        // ShortWritable only exists on some Hadoop versions; handled via
        // the compat shim rather than a direct instanceof
        generator.writeNumber(WritableCompatUtil.unwrap(writable));
    } else if (writable instanceof IntWritable) {
        generator.writeNumber(((IntWritable) writable).get());
    } else if (writable instanceof LongWritable) {
        generator.writeNumber(((LongWritable) writable).get());
    } else if (writable instanceof VLongWritable) {
        generator.writeNumber(((VLongWritable) writable).get());
    } else if (writable instanceof VIntWritable) {
        generator.writeNumber(((VIntWritable) writable).get());
    } else if (writable instanceof ByteWritable) {
        generator.writeNumber(((ByteWritable) writable).get());
    } else if (writable instanceof DoubleWritable) {
        generator.writeNumber(((DoubleWritable) writable).get());
    } else if (writable instanceof FloatWritable) {
        generator.writeNumber(((FloatWritable) writable).get());
    } else if (writable instanceof BooleanWritable) {
        generator.writeBoolean(((BooleanWritable) writable).get());
    } else if (writable instanceof BytesWritable) {
        BytesWritable bw = (BytesWritable) writable;
        // only the first getLength() bytes of the buffer are valid
        generator.writeBinary(bw.getBytes(), 0, bw.getLength());
    } else if (writable instanceof MD5Hash) {
        generator.writeString(writable.toString());
    } else if (writable instanceof ArrayWritable) {
        generator.writeBeginArray();
        for (Writable wrt : ((ArrayWritable) writable).get()) {
            // recurse; abort on the first unsuccessful element
            Result result = write(wrt, generator);
            if (!result.isSuccesful()) {
                return result;
            }
        }
        generator.writeEndArray();
    } else if (writable instanceof AbstractMapWritable) {
        Map<Writable, Writable> map = (Map<Writable, Writable>) writable;
        generator.writeBeginObject();
        // ignore handling sets (which are just maps with null values)
        for (Entry<Writable, Writable> entry : map.entrySet()) {
            String fieldName = entry.getKey().toString();
            // include/exclude filtering against the current object path
            if (shouldKeep(generator.getParentPath(), fieldName)) {
                generator.writeFieldName(fieldName);
                Result result = write(entry.getValue(), generator);
                if (!result.isSuccesful()) {
                    return result;
                }
            }
        }
        generator.writeEndObject();
    } else {
        // unknown Writable: optionally delegate, otherwise report failure
        if (writeUnknownTypes) {
            return handleUnknown(writable, generator);
        }
        return Result.FAILED(writable);
    }
    return Result.SUCCESFUL();
}
From source file:org.gestore.hadoop.LongRecordReader.java
License:Apache License
/****** * Gets one complete entry//from ww w . j av a 2s . c o m */ private int getEntry(Pattern matcherStart, Pattern matcherStop) throws IOException { boolean started = false; boolean done = false; ByteBuffer newLine = ByteBuffer.allocate(2); newLine.putChar('\n'); byte[] newLineBytes = newLine.array(); Text tempLine = new Text(); int totalRead = 0; int newRead = 0; // Discard lines before start record match, save first line that matches regex while (!started) { if (lastLine.getLength() <= 0) { newRead = in.readLine(tempLine, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength)); } else { tempLine = lastLine; newRead = lastLine.getLength(); lastLine = new Text(); } if (newRead == 0) { return 0; } totalRead += newRead; Matcher m = matcherStart.matcher(tempLine.toString()); if (m.matches()) { started = true; tempLine.append(newLineBytes, 0, newLineBytes.length); value.append(tempLine.getBytes(), 0, tempLine.getLength()); break; } } // Save lines until end record match, save last line while (!done) { newRead = in.readLine(tempLine, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength)); if (newRead == 0) { return totalRead; } totalRead += newRead; Matcher m = matcherStop.matcher(tempLine.toString()); if (m.matches()) { done = true; lastLine = tempLine; return totalRead -= newRead; } tempLine.append(newLineBytes, 0, newLineBytes.length); value.append(tempLine.getBytes(), 0, tempLine.getLength()); } return totalRead; }