Example usage for org.apache.hadoop.io Text getLength

List of usage examples for org.apache.hadoop.io Text getLength

Introduction

On this page you can find example usages of org.apache.hadoop.io.Text.getLength().

Prototype

@Override
public int getLength() 

Source Link

Document

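Returns the number of valid bytes in the backing byte array. The array returned by getBytes() may be longer than this, so only the first getLength() bytes should be read.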

Usage

From source file:org.commoncrawl.mapred.ec2.parser.ParserMapper.java

License:Open Source License

/**
 * Turns one crawl record into a {@code ParseOutput} and emits it keyed by the record's final URL.
 *
 * <p>The method fills a JSON metadata object and the {@code CrawlMetadata} held by the
 * parse output side by side, then copies selected JSON fields (mime type, md5, simhash)
 * plus any extracted content onto the parse output before collecting it.
 *
 * <p>Records with an empty or malformed URL are logged and dropped. Any exception raised
 * while processing a record is logged and counted but NOT rethrown, so a single bad record
 * does not fail the task (the rethrow is deliberately commented out below).
 *
 * @param url      the URL key of the record; must be non-empty and parseable by {@link URL}
 * @param value    the crawl result for that URL
 * @param output   collector that receives the (final URL, parse output) pair
 * @param reporter used only to increment job counters
 * @throws IOException declared by the interface; the body currently catches it itself
 */
@Override
public void map(Text url, CrawlURL value, OutputCollector<Text, ParseOutput> output, Reporter reporter)
        throws IOException {

    // an empty key carries nothing to parse; log the redirect URL for diagnosis and skip
    if (url.getLength() == 0) {
        LOG.error("Hit NULL URL. Original URL:" + value.getRedirectURL());
        return;
    }

    try {
        // allocate parse output
        ParseOutput parseOutput = new ParseOutput();
        // json object out ...
        JsonObject jsonObj = new JsonObject();
        // and create a crawl metadata (owned by parseOutput, so writes to it land in the output)
        CrawlMetadata metadata = parseOutput.getCrawlMetadata();

        // and content (if available) ...
        // e0 is passed to setTextContent; e1.e0 is the headers text bytes; e1.e1 is the raw content
        Pair<String, Pair<TextBytes, FlexBuffer>> contentOut = null;

        URL originalURL = null;

        try {
            originalURL = new URL(url.toString());
        } catch (MalformedURLException e) {
            LOG.error("Malformed URL:" + CCStringUtils.stringifyException(e));
            reporter.incrCounter(Counters.MALFORMED_FINAL_URL, 1);
            return;
        }

        // stays equal to the original URL unless the redirect branch below replaces it
        URL finalURL = originalURL;

        jsonObj.addProperty("attempt_time", value.getLastAttemptTime());
        metadata.setAttemptTime(value.getLastAttemptTime());

        // first step write status (JSON gets a string, metadata gets 0 for success / 1 otherwise)
        jsonObj.addProperty("disposition",
                (value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? "SUCCESS" : "FAILURE");
        metadata.setCrawlDisposition(
                (byte) ((value.getLastAttemptResult() == CrawlURL.CrawlResult.SUCCESS) ? 0 : 1));

        // deal with redirects ...
        if ((value.getFlags() & CrawlURL.Flags.IsRedirected) != 0) {
            // e0 becomes the final URL, e1 is the JSON stored under "redirect_from"
            Pair<URL, JsonObject> redirect = buildRedirectObject(originalURL, value, metadata, reporter);
            jsonObj.add("redirect_from", redirect.e1);
            finalURL = redirect.e0;
        }

        if (value.getLastAttemptResult() == CrawlURL.CrawlResult.FAILURE) {
            // failed fetch: record only the reason and detail
            jsonObj.addProperty("failure_reason",
                    CrawlURL.FailureReason.toString(value.getLastAttemptFailureReason()));
            metadata.setFailureReason(value.getLastAttemptFailureReason());
            jsonObj.addProperty("failure_detail", value.getLastAttemptFailureDetail());
            metadata.setFailureDetail(value.getLastAttemptFailureDetail());
        } else {
            // non-failure: record server ip, http result code, headers and content length
            jsonObj.addProperty("server_ip", IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
            metadata.setServerIP(value.getServerIP());
            jsonObj.addProperty("http_result", value.getResultCode());
            metadata.setHttpResult(value.getResultCode());
            jsonObj.add("http_headers",
                    httpHeadersToJsonObject(NIOHttpHeaders.parseHttpHeaders(value.getHeaders())));
            metadata.setHttpHeaders(value.getHeaders());
            jsonObj.addProperty("content_len", value.getContentRaw().getCount());
            metadata.setContentLength(value.getContentRaw().getCount());
            // only 2xx responses with a non-empty body get content metadata
            if (value.getResultCode() >= 200 && value.getResultCode() <= 299
                    && value.getContentRaw().getCount() > 0) {
                contentOut = populateContentMetadata(finalURL, value, reporter, jsonObj, metadata);
            }
        }

        // ok ... write stuff out ...
        reporter.incrCounter(Counters.WROTE_METADATA_RECORD, 1);
        //////////////////////////////////////////////////////////////
        // echo some stuff to parseOutput ...
        // the JSON is serialized first; the optional fields below are then read back from it
        parseOutput.setMetadata(jsonObj.toString());
        JsonElement mimeType = jsonObj.get("mime_type");
        if (mimeType != null) {
            parseOutput.setNormalizedMimeType(mimeType.getAsString());
        }
        JsonElement md5 = jsonObj.get("md5");
        if (md5 != null) {
            MD5Hash hash = new MD5Hash(md5.getAsString());
            byte[] bytes = hash.getDigest();
            parseOutput.setMd5Hash(new FlexBuffer(bytes, 0, bytes.length));
        }
        JsonElement simHash = jsonObj.get("text_simhash");
        if (simHash != null) {
            parseOutput.setSimHash(simHash.getAsLong());
        }
        // NOTE(review): host ip is set from getServerIP() even on the failure path - confirm intended
        parseOutput.setHostIPAddress(IPAddressUtils.IntegerToIPAddressString(value.getServerIP()));
        parseOutput.setFetchTime(value.getLastAttemptTime());
        ////////////////////////////////////////////////////////////

        if (contentOut != null) {
            if (contentOut.e0 != null) {
                parseOutput.setTextContent(contentOut.e0);
                reporter.incrCounter(Counters.WROTE_TEXT_CONTENT, 1);
            }
            if (contentOut.e1 != null) {

                // directly set the text bytes ...
                parseOutput.getHeadersAsTextBytes().set(contentOut.e1.e0);
                // mark it dirty !!! (the field was modified in place rather than through a setter)
                parseOutput.setFieldDirty(ParseOutput.Field_HEADERS);
                // if content available ...
                if (contentOut.e1.e1 != null) {
                    parseOutput.setRawContent(contentOut.e1.e1);
                }
                // counted whenever the header/content pair exists, even if the raw content is null
                reporter.incrCounter(Counters.WROTE_RAW_CONTENT, 1);
            }
        }

        //buildCompactMetadata(parseOutput,jsonObj,urlMap);

        // key the output by the final URL
        output.collect(new Text(finalURL.toString()), parseOutput);
    } catch (IOException e) {
        // logged and counted only; the record is dropped and the task keeps running
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_IO_EXCEPTION, 1);
        //TODO:HACK
        //throw e;
    } catch (Exception e) {
        // same best-effort handling for any other exception
        LOG.error("Exception Processing URL:" + url.toString() + "\n" + CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.GOT_UNHANDLED_RUNTIME_EXCEPTION, 1);
        //TODO: HACK
        //throw new IOException(e);
    }
}

From source file:org.commoncrawl.util.JoinValue.java

License:Open Source License

public JoinValue(TextBytes tag, Text value) {
    _tag = tag;//from  w  ww  .j  a v  a2 s  . c  o  m
    _type = TEXT_TYPE_JOIN_VALUE;
    _textValue = new TextBytes();
    _textValue.set(value.getBytes(), 0, value.getLength());
}

From source file:org.commoncrawl.util.shared.ArcFileReaderTests.java

License:Apache License

/**
 * Tests basic reader functionality by creating a mock ARC file in memory, reading it back
 * through {@code ARCFileReader}, and validating keys, headers and content of every record.
 *
 * <p>The input stream handed to the reader returns at most one byte per bulk read, so the
 * reader is exercised against short reads. The test also verifies that closing the reader
 * closes the underlying stream.
 */
@Test
public void testReader() {
    final Charset utf8 = Charset.forName("UTF-8");
    DataOutputBuffer os = new DataOutputBuffer();
    long timestamp = System.currentTimeMillis();
    try {
        // write the ARC File into memory
        writeFirstRecord(os, "test", timestamp);
        List<TestRecord> records = buildTestRecords(BASIC_TEST_RECORD_COUNT);
        long testAttemptTime = System.currentTimeMillis();

        for (TestRecord record : records) {
            NIOHttpHeaders headers = new NIOHttpHeaders();
            for (int i = 0; i < record.headers.size(); ++i) {
                headers.set(record.headers.get(i).e0, record.headers.get(i).e1);
            }

            write(os, record.url, "test", 1, 1, record.data, 0, record.data.length, headers, "text/html",
                    MD5Hash.digest(record.data).toString(), 12345, testAttemptTime);
        }
        os.flush();
        os.close();

        final AtomicBoolean streamClosed = new AtomicBoolean();
        // setup ArcFileReader to read the file
        InputStream in = new ByteArrayInputStream(os.getData(), 0, os.getLength()) {

            @Override
            public synchronized int read(byte b[], int off, int len) {
                // trickle at most one byte per call; never ask for more than the caller did,
                // so a zero-length read stays a zero-length read
                return super.read(b, off, Math.min(len, 1));
            }

            @Override
            public void close() throws IOException {
                super.close();
                streamClosed.set(true);
            }
        };
        ARCFileReader reader = new ARCFileReader(in);
        int index = 0;
        Text key = new Text();
        BytesWritable value = new BytesWritable();
        // header block terminator, encoded explicitly so the search does not depend on the platform charset
        final byte[] headerTerminator = "\r\n\r\n".getBytes(utf8);

        // iterate and validate stuff ...
        while (reader.hasMoreItems()) {
            reader.nextKeyValue(key, value);
            TestRecord testRecord = records.get(index++);
            // get test key bytes as utf-8 bytes ...
            byte[] testKeyBytes = testRecord.url.getBytes(utf8);
            // compare against the raw key bytes to validate the key is the same (Text's utf-8 mapping code
            // replaces invalid characters with ?, which would break this test because it deliberately uses
            // invalid characters to form the key).
            Assert.assertTrue(
                    compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);
            // returned bytes represent the header (encoded in utf-8), terminated by a \r\n\r\n. The content
            // follows this terminator; we search for this byte pattern to locate the start of content, then
            // compare it against the source ...
            int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(),
                    headerTerminator);
            if (indexofHeaderTerminator == -1) {
                throw new IOException("No Header Terminator found in Value!");
            }
            // step past the terminator so the index points at the first content byte
            indexofHeaderTerminator += 4;
            // read headers ...
            String headersText = new String(value.getBytes(), 0, indexofHeaderTerminator, utf8);
            NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(headersText);
            for (int i = 0; i < testRecord.headers.size(); ++i) {
                Pair<String, String> testHeaderRecord = testRecord.headers.get(i);
                Assert.assertNotNull(headers.findValue(testHeaderRecord.e0));
                Assert.assertEquals(testHeaderRecord.e1, headers.findValue(testHeaderRecord.e0));
            }

            Assert.assertTrue(compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(),
                    indexofHeaderTerminator, testRecord.data.length) == 0);
        }
        reader.close();

        // expected value first, so a failure message reads correctly
        Assert.assertEquals(BASIC_TEST_RECORD_COUNT, index);
        Assert.assertTrue(streamClosed.get());
    } catch (IOException e) {
        // surface I/O problems as a test failure; the cause carries the stack trace
        throw new RuntimeException(e);
    }
}

From source file:org.commoncrawl.util.TextBytes.java

License:Open Source License

/**
 * Sets this instance from the given {@link Text}, using only the first
 * {@code other.getLength()} bytes of its backing array.
 */
public void set(Text other) {
    final byte[] sourceBytes = other.getBytes();
    final int validLength = other.getLength();
    set(sourceBytes, 0, validLength);
}

From source file:org.commoncrawl.util.URLUtils.java

License:Open Source License

/**
 * Extracts the host name from a URL stored in a {@link Text} key.
 *
 * @param key the URL; only its first {@code getLength()} bytes are scanned
 * @return the host name, or {@code null} when no non-empty host span is found
 */
public static String getHostNameFromURLKey(Text key) {

    fastGetResult hostSpan = fastGetHostFromTextURL(key.getBytes(), 0, key.getLength());

    if (hostSpan == null || hostSpan.length == 0) {
        return null;
    }
    // Text holds UTF-8 encoded bytes, so decode explicitly rather than with the platform default charset
    return new String(key.getBytes(), hostSpan.offset, hostSpan.length,
            java.nio.charset.Charset.forName("UTF-8"));
}

From source file:org.elasticsearch.hadoop.mr.MapReduceWriter.java

License:Apache License

/**
 * Serializes a Hadoop {@link Writable} through the given generator.
 *
 * <p>Scalars are written directly, {@code ArrayWritable} becomes an array and
 * {@code AbstractMapWritable} becomes an object whose field names are the keys'
 * {@code toString()} values. Nested values are written recursively.
 *
 * @param writable  the value to serialize; {@code null} is written as a null
 * @param generator the sink receiving the output
 * @return {@code true} when the value (and every nested value) was written; {@code false}
 *         when a nested write failed or the type is unknown and unknown types are not
 *         being written; otherwise whatever {@code handleUnknown} returns
 */
@SuppressWarnings("unchecked")
public boolean write(Writable writable, Generator generator) {
    if (writable == null || writable instanceof NullWritable) {
        generator.writeNull();
        return true;
    }
    if (writable instanceof Text) {
        Text asText = (Text) writable;
        // only the first getLength() bytes of the backing array are valid
        generator.writeUTF8String(asText.getBytes(), 0, asText.getLength());
        return true;
    }
    if (writable instanceof UTF8) {
        UTF8 asUtf8 = (UTF8) writable;
        generator.writeUTF8String(asUtf8.getBytes(), 0, asUtf8.getLength());
        return true;
    }
    if (writable instanceof IntWritable) {
        generator.writeNumber(((IntWritable) writable).get());
        return true;
    }
    if (writable instanceof LongWritable) {
        generator.writeNumber(((LongWritable) writable).get());
        return true;
    }
    if (writable instanceof VLongWritable) {
        generator.writeNumber(((VLongWritable) writable).get());
        return true;
    }
    if (writable instanceof VIntWritable) {
        generator.writeNumber(((VIntWritable) writable).get());
        return true;
    }
    if (writable instanceof ByteWritable) {
        generator.writeNumber(((ByteWritable) writable).get());
        return true;
    }
    if (writable instanceof DoubleWritable) {
        generator.writeNumber(((DoubleWritable) writable).get());
        return true;
    }
    if (writable instanceof FloatWritable) {
        generator.writeNumber(((FloatWritable) writable).get());
        return true;
    }
    if (writable instanceof BooleanWritable) {
        generator.writeBoolean(((BooleanWritable) writable).get());
        return true;
    }
    if (writable instanceof BytesWritable) {
        BytesWritable asBytes = (BytesWritable) writable;
        generator.writeBinary(asBytes.getBytes(), 0, asBytes.getLength());
        return true;
    }
    if (writable instanceof MD5Hash) {
        generator.writeString(writable.toString());
        return true;
    }

    if (writable instanceof ArrayWritable) {
        generator.writeBeginArray();
        for (Writable element : ((ArrayWritable) writable).get()) {
            // bail out on the first element that cannot be written (array is left unterminated)
            if (!write(element, generator)) {
                return false;
            }
        }
        generator.writeEndArray();
        return true;
    }

    if (writable instanceof AbstractMapWritable) {
        Map<Writable, Writable> entries = (Map<Writable, Writable>) writable;

        generator.writeBeginObject();
        // ignore handling sets (which are just maps with null values)
        for (Entry<Writable, Writable> pair : entries.entrySet()) {
            generator.writeFieldName(pair.getKey().toString());
            if (!write(pair.getValue(), generator)) {
                return false;
            }
        }
        generator.writeEndObject();
        return true;
    }

    // none of the known types matched
    if (!writeUnknownTypes) {
        return false;
    }
    return handleUnknown(writable, generator);
}

From source file:org.elasticsearch.hadoop.mr.SafeWritableConverter.java

License:Apache License

/**
 * Points {@code to} at the bytes of {@code from} when it is a {@link Text} or a
 * {@link BytesWritable}; any other input leaves {@code to} untouched.
 *
 * @param from source object
 * @param to   target that receives the backing array and its valid length
 */
public void invoke(Object from, BytesArray to) {
    // handle common cases (the two types are unrelated classes, so at most one branch applies)
    if (from instanceof Text) {
        Text text = (Text) from;
        to.bytes(text.getBytes(), text.getLength());
    } else if (from instanceof BytesWritable) {
        BytesWritable binary = (BytesWritable) from;
        to.bytes(binary.getBytes(), binary.getLength());
    }
}

From source file:org.elasticsearch.hadoop.mr.WritableBytesConverter.java

License:Apache License

/**
 * Converts {@code from} into {@code to}. {@link Text} and {@link BytesWritable} are handled
 * here by handing over their backing array and valid length; every other type is delegated
 * to the superclass.
 *
 * @param from source object
 * @param to   target byte holder
 */
@Override
public void convert(Object from, BytesArray to) {
    // handle common cases
    if (from instanceof Text) {
        Text text = (Text) from;
        to.bytes(text.getBytes(), text.getLength());
    } else if (from instanceof BytesWritable) {
        BytesWritable binary = (BytesWritable) from;
        to.bytes(binary.getBytes(), binary.getLength());
    } else {
        super.convert(from, to);
    }
}

From source file:org.elasticsearch.hadoop.mr.WritableValueWriter.java

License:Apache License

/**
 * Serializes a Hadoop {@link Writable} through the given generator.
 *
 * <p>Scalars are written directly, {@code ArrayWritable} becomes an array and
 * {@code AbstractMapWritable} becomes an object. Map entries are written only when
 * {@code shouldKeep} accepts the field name for the generator's current parent path.
 * Nested values are written recursively and the first unsuccessful nested result is
 * returned immediately.
 *
 * @param writable  the value to serialize; {@code null} is written as a null
 * @param generator the sink receiving the output
 * @return a successful result when everything was written; the failing nested result
 *         otherwise; for unknown types either the result of {@code handleUnknown} or a
 *         failed result wrapping the writable
 */
@Override
@SuppressWarnings({ "unchecked", "deprecation" })
public Result write(Writable writable, Generator generator) {
    if (writable == null || writable instanceof NullWritable) {
        generator.writeNull();
        return Result.SUCCESFUL();
    }
    if (writable instanceof Text) {
        Text asText = (Text) writable;
        // only the first getLength() bytes of the backing array are valid
        generator.writeUTF8String(asText.getBytes(), 0, asText.getLength());
        return Result.SUCCESFUL();
    }
    if (writable instanceof UTF8) {
        UTF8 asUtf8 = (UTF8) writable;
        generator.writeUTF8String(asUtf8.getBytes(), 0, asUtf8.getLength());
        return Result.SUCCESFUL();
    }
    if (WritableCompatUtil.isShortWritable(writable)) {
        generator.writeNumber(WritableCompatUtil.unwrap(writable));
        return Result.SUCCESFUL();
    }
    if (writable instanceof IntWritable) {
        generator.writeNumber(((IntWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof LongWritable) {
        generator.writeNumber(((LongWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof VLongWritable) {
        generator.writeNumber(((VLongWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof VIntWritable) {
        generator.writeNumber(((VIntWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof ByteWritable) {
        generator.writeNumber(((ByteWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof DoubleWritable) {
        generator.writeNumber(((DoubleWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof FloatWritable) {
        generator.writeNumber(((FloatWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof BooleanWritable) {
        generator.writeBoolean(((BooleanWritable) writable).get());
        return Result.SUCCESFUL();
    }
    if (writable instanceof BytesWritable) {
        BytesWritable asBytes = (BytesWritable) writable;
        generator.writeBinary(asBytes.getBytes(), 0, asBytes.getLength());
        return Result.SUCCESFUL();
    }
    if (writable instanceof MD5Hash) {
        generator.writeString(writable.toString());
        return Result.SUCCESFUL();
    }

    if (writable instanceof ArrayWritable) {
        generator.writeBeginArray();
        for (Writable element : ((ArrayWritable) writable).get()) {
            Result elementResult = write(element, generator);
            // propagate the first failure as-is (array is left unterminated)
            if (!elementResult.isSuccesful()) {
                return elementResult;
            }
        }
        generator.writeEndArray();
        return Result.SUCCESFUL();
    }

    if (writable instanceof AbstractMapWritable) {
        Map<Writable, Writable> entries = (Map<Writable, Writable>) writable;

        generator.writeBeginObject();
        // ignore handling sets (which are just maps with null values)
        for (Entry<Writable, Writable> pair : entries.entrySet()) {
            String fieldName = pair.getKey().toString();
            // skip fields rejected for the current parent path
            if (!shouldKeep(generator.getParentPath(), fieldName)) {
                continue;
            }
            generator.writeFieldName(fieldName);
            Result fieldResult = write(pair.getValue(), generator);
            if (!fieldResult.isSuccesful()) {
                return fieldResult;
            }
        }
        generator.writeEndObject();
        return Result.SUCCESFUL();
    }

    // none of the known types matched
    if (writeUnknownTypes) {
        return handleUnknown(writable, generator);
    }
    return Result.FAILED(writable);
}

From source file:org.gestore.hadoop.LongRecordReader.java

License:Apache License

/******
 * Gets one complete entry.
 *
 * Lines are skipped until one matches {@code matcherStart}; that line and every following
 * line are appended to {@code value}, each followed by a single newline byte, until a line
 * matching {@code matcherStop} is read. The stop line is not appended: it is kept in
 * {@code lastLine} so the next call can start from it without reading again.
 *
 * @param matcherStart pattern a whole line must match to begin the entry
 * @param matcherStop  pattern a whole line must match to end the entry
 * @return 0 when input ends before any start line is found; otherwise the running total of
 *         the per-line counts, excluding the stop line's count when one was found
 *         (presumably a consumed byte count - verify against what {@code in.readLine} returns)
 * @throws IOException if reading a line fails
 */

private int getEntry(Pattern matcherStart, Pattern matcherStop) throws IOException {
    // One line-feed byte. ByteBuffer.putChar('\n') would produce the two bytes 0x00 0x0A
    // and so insert a NUL byte before every newline appended to the value.
    final byte[] newLineBytes = { '\n' };

    Text tempLine = new Text();
    int totalRead = 0;
    int newRead;
    // Discard lines before start record match, save first line that matches regex
    while (true) {
        if (lastLine.getLength() <= 0) {
            newRead = in.readLine(tempLine, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        } else {
            // reuse the line carried over from the previous call instead of reading a new one
            tempLine = lastLine;
            newRead = lastLine.getLength();
            lastLine = new Text();
        }
        if (newRead == 0) {
            // nothing more to read and no entry started
            return 0;
        }
        totalRead += newRead;
        if (matcherStart.matcher(tempLine.toString()).matches()) {
            tempLine.append(newLineBytes, 0, newLineBytes.length);
            value.append(tempLine.getBytes(), 0, tempLine.getLength());
            break;
        }
    }

    // Save lines until end record match, save last line
    while (true) {
        newRead = in.readLine(tempLine, maxLineLength,
                Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        if (newRead == 0) {
            // input ended inside an entry: return what was accumulated so far
            return totalRead;
        }
        totalRead += newRead;
        if (matcherStop.matcher(tempLine.toString()).matches()) {
            // the stop line is carried over to the next call, so do not count it here
            lastLine = tempLine;
            return totalRead - newRead;
        }
        tempLine.append(newLineBytes, 0, newLineBytes.length);
        value.append(tempLine.getBytes(), 0, tempLine.getLength());
    }
}