org.commoncrawl.service.parser.server.ParseWorker.java Source code

Introduction

Here is the source code for org.commoncrawl.service.parser.server.ParseWorker.java
Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.service.parser.server;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.io.OutputStreamWriter;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.LinkedList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.DataOutputBuffer;
import org.commoncrawl.io.NIOHttpHeaders;
import org.commoncrawl.protocol.shared.HTMLMeta;
import org.commoncrawl.protocol.shared.HTMLMetaAttribute;
import org.commoncrawl.service.parser.Link;
import org.commoncrawl.service.parser.ParseResult;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.CharsetUtils;
import org.commoncrawl.util.FlexBuffer;
import org.commoncrawl.util.HttpHeaderUtils;
import org.commoncrawl.util.MimeTypeFilter;
import org.commoncrawl.util.HttpHeaderUtils.ContentTypeAndCharset;
import org.commoncrawl.util.MimeTypeFilter.MimeTypeDisposition;
import org.commoncrawl.util.Tuples.Pair;
import org.w3c.dom.Document;

import com.dappit.Dapper.parser.DocumentBuilder;
import com.dappit.Dapper.parser.InstructionsPool;
import com.dappit.Dapper.parser.MozillaParser;
import com.dappit.Dapper.parser.ParserInitializationException;
import com.dappit.Dapper.parser.ParserInstruction;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.io.ByteProcessor;
import com.google.common.io.ByteStreams;
import com.google.common.io.InputSupplier;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.internal.Streams;
import com.google.gson.stream.JsonWriter;

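/**
 * Drives the Mozilla-based HTML parser and, acting as its {@link DocumentBuilder},
 * turns the parser's instruction stream into a {@link ParseResult}: extracted
 * links, meta tags, the page title and the accumulated page text.
 *
 * @author rana
 *
 */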
public class ParseWorker implements DocumentBuilder {

    /** Logger for this class (previously bound to ParserSlaveServer by mistake). */
    private static final Log LOG = LogFactory.getLog(ParseWorker.class);

    /**
     * URL against which link targets are resolved. Set by the parse entry points
     * and replaced when a base href is encountered while building the document.
     */
    URL baseURL = null;

    /** Maps each link-bearing tag name to the attribute that carries its target URL. */
    ImmutableMap<String, String> linkTypeToSrcMap = new ImmutableMap.Builder<String, String>()
            .put("a", "href").put("area", "href").put("frame", "src").put("iframe", "src")
            .put("script", "src").put("link", "href").put("img", "src").build();

    /** Tags whose text content is kept out of the accumulated page text. */
    ImmutableSet<String> ignoreTextTagSet = new ImmutableSet.Builder<String>().add("noscript").build();

    /**
     * Mutable holder for a link whose tag has been opened but not yet closed.
     * Attributes and anchor text are collected here until {@link #buildLink()}
     * turns them into a {@link Link}. This is an inner (non-static) class because
     * it resolves targets against the enclosing worker's {@code baseURL}.
     */
    class LinkUnderConstruction {
        /** Raw target taken from the tag's href/src attribute; null until seen. */
        public String linkURL = null;
        /** Lower-cased tag name that produced this link. */
        public String type = null;
        /** Attributes of the link, serialized into the Link's attribute string. */
        public JsonObject jsonObject = new JsonObject();
        /** Text collected while this link was the innermost open link. */
        public String linkText = "";

        /**
         * @param linkType       tag name of the link element
         * @param blockInContext enclosing block, currently unused. A disabled variant
         *                       of this constructor attached a "context" object to the
         *                       JSON (type, oid, id, class, t_row/t_cell for tables and
         *                       the parent's p_oid) describing this block.
         */
        public LinkUnderConstruction(String linkType, BlockObjectInContext blockInContext) {
            this.type = linkType;
            this.jsonObject.addProperty("type", linkType);
        }

        /**
         * Resolves the collected target against the base URL and packages it,
         * together with the collected attributes and text, as a Link.
         *
         * @return the finished link, or null when there is no target, the target
         *         is a pure fragment ("#..."), or it cannot be resolved to a URL
         */
        public Link buildLink() {
            boolean hasTarget = linkURL != null && linkURL.length() != 0;
            if (!hasTarget || linkURL.startsWith("#")) {
                return null;
            }

            final URL resolved;
            try {
                resolved = new URL(baseURL, linkURL);
            } catch (MalformedURLException ignored) {
                // unresolvable targets are silently dropped
                return null;
            }

            Link result = new Link();
            result.setUrl(resolved.toString());
            jsonObject.addProperty("text", linkText);
            result.setAttributes(jsonObject.toString());
            return result;
        }
    }

    // Result object that buildDocument writes links, meta tags and the title into;
    // assigned by the parse entry points just before they invoke the parser.
    private ParseResult activeParseResult;

    /**
     * Parses an HTML string and stores the outcome in {@code parseResultOut}.
     * The content is handed to the Mozilla parser as UTF-8 bytes and the content
     * type is reported as "text/html".
     *
     * @param parseResultOut receives success flag, failure reason, content type,
     *                       text, and whatever buildDocument extracts
     * @param baseURL        URL used to resolve link targets
     * @param content        HTML to parse
     * @throws IOException if MozillaParser.init fails
     */
    public void parsePartialHTMLDocument(ParseResult parseResultOut, URL baseURL, String content)
            throws IOException {
        parseResultOut.setParseSuccessful(false);
        this.baseURL = baseURL;
        try {

            String mozillaLibPath = System.getenv().get("MOZILLA_LIB_PATH");
            if (mozillaLibPath == null || !new File(mozillaLibPath).isDirectory()) {
                mozillaLibPath = "/usr/local/lib";
            }
            // logged rather than printed so stdout stays free for real output
            LOG.info("Mozilla Location:" + mozillaLibPath);

            // init parser ...
            MozillaParser.init(null, mozillaLibPath);

            try {
                MozillaParser parser = new MozillaParser(this);
                activeParseResult = parseResultOut;
                try {
                    //LOG.info("Parsing Document");
                    parser.parse(content.getBytes(Charset.forName("UTF-8")), "utf-8", null);
                } finally {
                    // never keep a reference to the caller's result, even when parsing throws
                    activeParseResult = null;
                }
                // set content type ...
                parseResultOut.setContentType("text/html");
                // collapse runs of horizontal whitespace, then blank-ish lines, then newlines
                String finalText = textAccumulator.toString().replaceAll("[ \\t\\x0B\\f]+", " ");
                while (finalText.indexOf("\n \n") != -1) {
                    finalText = finalText.replaceAll("(\\n \\n)+", "\n");
                }
                finalText = finalText.replaceAll("[\\n]+", "\n");
                parseResultOut.setText(finalText);
                parseResultOut.setParseSuccessful(true);
            } catch (ParserInitializationException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                parseResultOut.setParseFailureReason("Parser Initialization Failed!");
            } catch (Exception e) {
                // parse failures are reported through the result, not thrown
                parseResultOut.setParseFailureReason(CCStringUtils.stringifyException(e));
                LOG.error(parseResultOut);
            }
        } catch (ParserInitializationException e) {
            parseResultOut.setParseFailureReason("Parser Initialization Failed!");
            LOG.error(CCStringUtils.stringifyException(e));
            throw new IOException(e);
        }
    }

    /**
     * Parses a fetched document and stores the outcome in {@code parseResultOut}.
     * <p>
     * The charset is detected from the headers and content (ISO-8859-1 when
     * detection yields nothing), the content is converted to UTF-8, and, if the
     * content type is accepted as HTML, it is run through the Mozilla parser.
     * Empty content leaves the result marked unsuccessful with no failure reason,
     * as does a content type classified as ACCEPT_OTHER.
     *
     * @param parseResultOut receives success flag, failure reason, content type,
     *                       text, and whatever buildDocument extracts
     * @param domainId       not used by this method
     * @param documentId     not used by this method
     * @param baseURL        URL used to resolve link targets
     * @param rawHeaders     raw HTTP response headers
     * @param data           raw document bytes
     * @throws IOException if MozillaParser.init fails, charset conversion fails,
     *                     or the conversion produces no bytes
     */
    public void parseDocument(ParseResult parseResultOut, long domainId, long documentId, URL baseURL,
            String rawHeaders, FlexBuffer data) throws IOException {

        parseResultOut.setParseSuccessful(false);

        this.baseURL = baseURL;

        if (data.getCount() != 0) {
            try {
                String mozillaLibPath = System.getenv().get("MOZILLA_LIB_PATH");
                if (mozillaLibPath == null || !new File(mozillaLibPath).isDirectory()) {
                    mozillaLibPath = "/usr/local/lib";
                }
                // logged rather than printed so stdout stays free for real output
                LOG.info("Mozilla Location:" + mozillaLibPath);

                // init parser ...
                MozillaParser.init(null, mozillaLibPath);
                // load headers ...
                NIOHttpHeaders headers = NIOHttpHeaders.parseHttpHeaders(rawHeaders);
                // detect content type ...
                ContentTypeAndCharset contentTypeInfo = new ContentTypeAndCharset();
                HttpHeaderUtils.parseContentType(headers, contentTypeInfo);
                // ok now extract charset if possible ...
                Pair<Integer, Charset> charsetTuple = CharsetUtils.bestEffortDetectCharset(rawHeaders, data.get(),
                        data.getOffset(), data.getCount());
                if (charsetTuple == null) {
                    charsetTuple = new Pair<Integer, Charset>(CharsetUtils.CHARSET_SRC_NO_MATCH,
                            Charset.forName("ISO-8859-1"));
                }
                // decode bytes ... and convert to utf-8
                ByteBuffer utf8Bytes = null;
                try {
                    if (charsetTuple.e1.toString().equalsIgnoreCase("utf-8")) {
                        // already utf-8: view the valid region of the source array
                        utf8Bytes = ByteBuffer.wrap(data.get(), data.getOffset(), data.getCount());
                    } else {
                        CharBuffer ucs2Chars = charsetTuple.e1
                                .decode(ByteBuffer.wrap(data.get(), data.getOffset(), data.getCount()));
                        utf8Bytes = Charset.forName("UTF-8").encode(ucs2Chars);
                    }
                } catch (Exception e) {
                    LOG.error(CCStringUtils.stringifyException(e));
                    parseResultOut.setParseFailureReason(CCStringUtils.stringifyException(e));
                    // this should not have happened... we consider this unrecoverable
                    throw new IOException(e);
                }
                if (utf8Bytes == null || utf8Bytes.remaining() == 0) {
                    parseResultOut.setParseFailureReason("Invalid UTF-8 bytes detected for doc:" + baseURL
                            + " detector:" + charsetTuple.e0 + " Charset:" + charsetTuple.e1);
                    throw new IOException(parseResultOut.getParseFailureReason());
                }
                MimeTypeDisposition disposition = MimeTypeFilter
                        .checkMimeTypeDisposition(contentTypeInfo._contentType);
                if (disposition == MimeTypeDisposition.ACCEPT_HTML) {
                    // ByteBuffer.array() exposes the WHOLE backing array, regardless of
                    // position/limit, so only reuse it when the buffer spans it exactly;
                    // otherwise copy just the valid bytes so the parser sees no trailing junk.
                    byte[] utf8Array;
                    if (utf8Bytes.hasArray() && utf8Bytes.arrayOffset() == 0 && utf8Bytes.position() == 0
                            && utf8Bytes.limit() == utf8Bytes.array().length) {
                        utf8Array = utf8Bytes.array();
                    } else {
                        utf8Array = new byte[utf8Bytes.remaining()];
                        utf8Bytes.get(utf8Array);
                    }

                    // ok ready to send to mozilla ...
                    try {
                        MozillaParser parser = new MozillaParser(this);
                        activeParseResult = parseResultOut;
                        try {
                            parser.parse(utf8Array, "utf-8", null);
                        } finally {
                            // never keep a reference to the caller's result, even when parsing throws
                            activeParseResult = null;
                        }
                        // set content type ...
                        parseResultOut.setContentType(contentTypeInfo._contentType);
                        // collapse runs of horizontal whitespace, then blank-ish lines, then newlines
                        String finalText = textAccumulator.toString().replaceAll("[ \\t\\x0B\\f]+", " ");
                        while (finalText.indexOf("\n \n") != -1) {
                            finalText = finalText.replaceAll("(\\n \\n)+", "\n");
                        }
                        finalText = finalText.replaceAll("[\\n]+", "\n");
                        parseResultOut.setText(finalText);
                        parseResultOut.setParseSuccessful(true);
                    } catch (ParserInitializationException e) {
                        LOG.error(CCStringUtils.stringifyException(e));
                        parseResultOut.setParseFailureReason("Parser Initialization Failed!");
                    } catch (Exception e) {
                        // parse failures are reported through the result, not thrown
                        parseResultOut.setParseFailureReason(CCStringUtils.stringifyException(e));
                        LOG.error(parseResultOut);
                    }
                } else if (disposition == MimeTypeDisposition.ACCEPT_OTHER) {
                    // accepted but not parsed here: result stays unsuccessful with no reason set
                } else {
                    parseResultOut.setParseFailureReason("Unsupported ContentType:" + contentTypeInfo._contentType);
                }

            } catch (ParserInitializationException e) {
                parseResultOut.setParseFailureReason("Parser Initialization Failed!");
                LOG.error(CCStringUtils.stringifyException(e));
                throw new IOException(e);
            }
        }
    }

    /**
     * Command-line driver: reads an HTTP response (headers, blank line, body)
     * from stdin, parses it, and writes the result as JSON followed by the
     * extracted text to stdout.
     * <p>
     * Options: {@code --noHeaders} treats all of stdin as body and assumes a
     * "text/html" content type; {@code --baseURL <url>} sets the URL used to
     * resolve links (default "http://unknown.com/").
     *
     * @param args command-line options
     * @throws IOException on a missing or invalid --baseURL value, or when
     *                     reading, parsing or writing fails
     */
    public static void main(String[] args) throws IOException {
        String baseURL = "http://unknown.com/";
        NIOHttpHeaders headers = null;
        for (int i = 0; i < args.length; ++i) {
            if (args[i].equalsIgnoreCase("--noHeaders")) {
                headers = new NIOHttpHeaders();
                headers.add("content-type", "text/html");
            } else if (args[i].equalsIgnoreCase("--baseURL")) {
                // the option needs a value; fail clearly instead of running off the array
                if (i + 1 >= args.length) {
                    throw new IOException("Missing value for --baseURL");
                }
                baseURL = args[++i];
            }
        }
        URL baseURLObj;
        try {
            baseURLObj = new URL(baseURL);
        } catch (MalformedURLException e2) {
            LOG.error(CCStringUtils.stringifyException(e2));
            throw new IOException("Invalid Base Link");
        }

        final DataOutputBuffer headerBuffer = new DataOutputBuffer();
        final DataOutputBuffer contentBuffer = new DataOutputBuffer();
        final boolean processHeaders = (headers == null);

        try {
            ByteStreams.readBytes(new InputSupplier<InputStream>() {

                @Override
                public InputStream getInput() throws IOException {
                    return System.in;
                }
            }, new ByteProcessor<Long>() {

                @Override
                public Long getResult() {
                    return 0L;
                }

                // number of non-CR/LF bytes seen on the current header line;
                // kept across calls so a line may straddle two buffers
                int currLineCharCount = 0;
                boolean processingHeaders = processHeaders;

                @Override
                public boolean processBytes(byte[] buf, int start, int length) throws IOException {

                    if (processingHeaders) {
                        int current = start;
                        int end = current + length;
                        while (processingHeaders && current != end) {
                            if (buf[current] != '\r' && buf[current] != '\n') {
                                currLineCharCount++;
                            } else if (buf[current] == '\n') {
                                if (currLineCharCount == 0) {
                                    // empty line: headers end here (terminator included)
                                    headerBuffer.write(buf, start, current - start + 1);
                                    processingHeaders = false;
                                }
                                currLineCharCount = 0;
                            }
                            current++;
                        }
                        if (processingHeaders) {
                            headerBuffer.write(buf, start, length);
                        } else {
                            // whatever follows the blank line in this buffer is body
                            length -= current - start;
                            start = current;
                        }
                    }
                    if (!processingHeaders) {
                        contentBuffer.write(buf, start, length);
                    }
                    return true;
                }
            });

            LOG.info("CONTENT LEN:" + contentBuffer.getLength());
            // decode header bytes ... (String(byte[],..,Charset) substitutes malformed
            // input instead of throwing, so no fallback charset is needed)
            String header = "";
            if (headerBuffer.getLength() != 0) {
                header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(),
                        Charset.forName("UTF-8"));
            } else if (headers != null) {
                header = headers.toString();
            }
            LOG.info("HEADER LEN:" + header.length());
            System.out.println(header);

            ParseWorker worker = new ParseWorker();
            ParseResult result = new ParseResult();
            worker.parseDocument(result, 0L, 0L, baseURLObj, header,
                    new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));
            LOG.info("Parse Result:" + result.getParseSuccessful());

            OutputStreamWriter outputWriter = new OutputStreamWriter(System.out, "UTF-8");
            JsonElement resultObj = parseResultToJSON(result);
            JsonWriter writer = new JsonWriter(outputWriter);
            writer.setIndent("    ");
            writer.setHtmlSafe(true);
            writer.setLenient(true);
            Streams.write(resultObj, writer);
            writer.flush();

            outputWriter.write("******** TEXT OUTPUT **********\n");
            outputWriter.write(result.getText());
            outputWriter.flush();
        } catch (IOException e1) {
            // report through the logger and propagate so the process exits non-zero
            LOG.error(CCStringUtils.stringifyException(e1));
            throw e1;
        }
    }

    /**
     * Renders a parse result as a JSON object. Scalar fields are emitted only
     * when flagged dirty on the result; "meta_tags" and "extracted_links" are
     * emitted only when non-empty.
     *
     * @param result parse result to convert
     * @return JSON object with the populated fields of {@code result}
     */
    public static final JsonObject parseResultToJSON(ParseResult result) {
        final JsonObject json = new JsonObject();

        // scalar fields, in output order
        if (result.isFieldDirty(ParseResult.Field_DOMAINID)) {
            json.addProperty("domainId", result.getDomainId());
        }
        if (result.isFieldDirty(ParseResult.Field_DOCID)) {
            json.addProperty("docId", result.getDocId());
        }
        if (result.isFieldDirty(ParseResult.Field_CONTENTTYPE)) {
            json.addProperty("contentType", result.getContentType().toString());
        }
        if (result.isFieldDirty(ParseResult.Field_CONTEXT)) {
            json.addProperty("context", result.getContext().toString());
        }
        if (result.isFieldDirty(ParseResult.Field_PARSESUCCESSFUL)) {
            json.addProperty("parseSuccessful", result.getParseSuccessful());
        }
        if (result.isFieldDirty(ParseResult.Field_PARSEFAILUREREASON)) {
            json.addProperty("parseFailureReason", result.getParseFailureReason());
        }
        if (result.isFieldDirty(ParseResult.Field_TITLE)) {
            json.addProperty("title", result.getTitle());
        }

        // one JSON object per meta tag, keyed by attribute name
        final int metaTagCount = result.getMetaTags().size();
        if (metaTagCount > 0) {
            JsonArray metaTags = new JsonArray();
            for (int m = 0; m < metaTagCount; ++m) {
                HTMLMeta tag = result.getMetaTags().get(m);
                JsonObject tagJson = new JsonObject();
                for (HTMLMetaAttribute attr : tag.getAttributes()) {
                    tagJson.addProperty(attr.getName(), attr.getValue());
                }
                metaTags.add(tagJson);
            }
            json.add("meta_tags", metaTags);
        }

        // one JSON object per link: its url plus the serialized attribute string
        final int linkCount = result.getExtractedLinks().size();
        if (linkCount > 0) {
            JsonArray links = new JsonArray();
            for (int k = 0; k < linkCount; ++k) {
                Link extracted = result.getExtractedLinks().get(k);
                JsonObject linkJson = new JsonObject();
                linkJson.addProperty("url", extracted.getUrl());
                linkJson.addProperty("attributes", extracted.getAttributes());
                links.add(linkJson);
            }
            json.add("extracted_links", links);
        }

        if (result.isFieldDirty(ParseResult.Field_TEXT)) {
            json.addProperty("text", result.getText().toString());
        }
        return json;
    }

    // Parser state mutated by buildDocument. None of it is reset by the parse
    // entry points in this class.

    // nesting depth of open <head> elements
    int inHeadTag = 0;
    // non-zero while a <base> element opened inside <head> is still open
    int inBase = 0;
    // counter used to number table/div blocks as they are opened
    int blockId = 0;
    // not referenced anywhere else in this class
    int inTable = 0;

    // link created by the most recent open/leaf instruction, or null if that node was not a link
    LinkUnderConstruction activeLink = null;
    // table/div created by the most recent open/leaf instruction, or null otherwise
    BlockObjectInContext blockInConstruction = null;
    // stack of links whose tags are currently open (innermost first)
    LinkedList<LinkUnderConstruction> linksUnderConstruction = new LinkedList<LinkUnderConstruction>();
    // raw page text collected from AddText instructions plus inserted separators
    StringBuffer textAccumulator = new StringBuffer();

    /**
     * A table or div element that is currently open, linked to the block that
     * encloses it. Row and cell counters start at -1, meaning "none seen yet".
     */
    static class BlockObjectInContext {
        /** Enclosing block, or null for an outermost block. */
        public BlockObjectInContext parent;
        /** Tag name of the block. */
        public String type;
        /** Sequence number assigned when the block was opened. */
        public int id;
        /** Index of the current row; -1 before the first row. */
        public int rowNumber;
        /** Index of the current cell within the row; -1 before the first cell. */
        public int cellNumber;
        /** Value of the element's class attribute, if any. */
        public String classId;
        /** Value of the element's id attribute, if any. */
        public String htmlId;

        /**
         * @param enclosingBlock block this one is nested in, or null
         * @param tagName        tag name of the block
         * @param sequenceId     number identifying this block
         */
        public BlockObjectInContext(BlockObjectInContext enclosingBlock, String tagName, int sequenceId) {
            parent = enclosingBlock;
            type = tagName;
            id = sequenceId;
            rowNumber = -1;
            cellNumber = -1;
            classId = null;
            htmlId = null;
        }
    }

    /**
     * Tags treated as block-level: buildDocument starts a new line in the
     * accumulated text when one of these is opened or closed.
     */
    static ImmutableSet<String> blockLevelHTMLTags = ImmutableSet.of(
            "address", "blockquote", "div", "dl", "fieldset", "form",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "hr", "noscript", "ol", "p", "pre", "table", "ul",
            "dd", "dt", "li",
            "tbody", "td", "tfoot", "th", "thead", "tr",
            "button", "del", "ins", "map", "object", "script");

    /**
     * Walks the parser's instruction stream and fills the active parse result
     * with links, meta tags and the title, while appending page text to
     * {@code textAccumulator}. No DOM is built.
     * <p>
     * Must only be invoked while a parse entry point has set the active result.
     *
     * @param instructionsPool     parallel lists of operations and their arguments
     * @param optionalOutputStream not used
     * @return always null
     * @throws IOException declared by the interface; not thrown by this implementation
     */
    @Override
    public Document buildDocument(InstructionsPool instructionsPool, FileOutputStream optionalOutputStream)
            throws IOException {

        List<Integer> operations = instructionsPool.operations;
        List<String> arguments = instructionsPool.arguments;
        // indices (into arguments) of the currently open nodes, innermost first
        LinkedList<Integer> nodeStack = new LinkedList<Integer>();
        // currently open table/div blocks, innermost first
        LinkedList<BlockObjectInContext> blockStack = new LinkedList<BlockObjectInContext>();
        // meta tag whose attributes are being collected, if any
        HTMLMeta meta = null;

        for (int i = 0; i < operations.size(); i++) {
            int domOperation = operations.get(i);
            String domArgument = arguments.get(i);
            switch (domOperation) {
            // Open node :
            case ParserInstruction.OpenNode:
            case ParserInstruction.AddLeaf: {
                // attributes that follow belong to this node only
                activeLink = null;
                blockInConstruction = null;
                String nodeName = domArgument.toLowerCase();

                // start a new line when a block level tag opens ...
                if (domOperation == ParserInstruction.OpenNode && blockLevelHTMLTags.contains(nodeName)) {
                    if (textAccumulator.length() != 0
                            && textAccumulator.charAt(textAccumulator.length() - 1) != '\n') {
                        textAccumulator.append("\n");
                    }
                }

                if (nodeName.equals("meta")) {
                    meta = new HTMLMeta();
                } else if (linkTypeToSrcMap.containsKey(nodeName)) {
                    activeLink = new LinkUnderConstruction(nodeName, blockStack.peek());
                    linksUnderConstruction.push(activeLink);
                } else if (nodeName.equals("head")) {
                    inHeadTag++;
                } else if (nodeName.equals("base")) {
                    if (inHeadTag != 0) {
                        inBase++;
                    }
                } else if (nodeName.equals("table") || nodeName.equals("div")) {
                    blockInConstruction = new BlockObjectInContext(blockStack.peek(), nodeName, ++blockId);
                    blockStack.push(blockInConstruction);
                } else if (nodeName.equals("tr") || nodeName.equals("th")) {
                    // NOTE(review): "th" advances the row counter like "tr" although it is a
                    // header cell; the counters are not consumed anywhere visible - confirm intent
                    BlockObjectInContext table = blockStack.peek();
                    if (table != null) {
                        table.rowNumber++;
                        table.cellNumber = -1;
                    }
                } else if (nodeName.equals("td")) {
                    BlockObjectInContext table = blockStack.peek();
                    if (table != null) {
                        table.cellNumber++;
                    }
                }
                nodeStack.push(i);
                break;
            }
            // Close node :
            case ParserInstruction.CloseNode:
            case ParserInstruction.CloseLeaf: {
                int arguementPos = nodeStack.pop();
                String nodeName = arguments.get(arguementPos).toLowerCase();

                // start a new line when a block level tag closes ...
                if (domOperation == ParserInstruction.CloseNode && blockLevelHTMLTags.contains(nodeName)) {
                    if (textAccumulator.length() != 0
                            && textAccumulator.charAt(textAccumulator.length() - 1) != '\n') {
                        textAccumulator.append("\n");
                    }
                }

                if (nodeName.equals("head")) {
                    inHeadTag--;
                } else if (nodeName.equals("base")) {
                    if (inHeadTag != 0) {
                        inBase--;
                    }
                } else if (linkTypeToSrcMap.containsKey(nodeName)) {
                    // poll() yields null on an empty stack, so the guard below is meaningful
                    LinkUnderConstruction linkPartial = linksUnderConstruction.poll();
                    if (linkPartial != null) {
                        Link link = linkPartial.buildLink();
                        if (link != null) {
                            activeParseResult.getExtractedLinks().add(link);
                        }
                    }
                } else if (nodeName.equals("table") || nodeName.equals("div")) {
                    blockStack.pop();
                } else if (nodeName.equals("meta")) {
                    if (meta != null) {
                        activeParseResult.getMetaTags().add(meta);
                        meta = null;
                    }
                }
                // keep text of adjacent elements from running together
                if (textAccumulator.length() != 0
                        && !Character.isWhitespace(textAccumulator.charAt(textAccumulator.length() - 1))) {
                    textAccumulator.append(" ");
                }
                break;
            }
            case ParserInstruction.AddText: {
                Integer arguementPos = nodeStack.peek();
                String nodeName = (arguementPos != null) ? arguments.get(arguementPos).toLowerCase() : null;
                LinkUnderConstruction link = linksUnderConstruction.peek();

                // text inside an open link also becomes part of that link's text
                if (link != null) {
                    if (link.linkText.length() != 0) {
                        link.linkText += " ";
                    }
                    link.linkText += domArgument.trim();
                }
                if (nodeName == null || !ignoreTextTagSet.contains(nodeName)) {
                    textAccumulator.append(domArgument);
                }
                break;
            }
            case ParserInstruction.WriteAttributeKey: {

                // grab key name ..
                String key = domArgument.toLowerCase();

                // and lookahead one to grab attribute value ...
                i++;

                if (i < operations.size() && operations.get(i) == ParserInstruction.WriteAttributeValue) {
                    // grab value ...
                    String value = arguments.get(i);

                    // if metatag capture key/value ...
                    if (meta != null) {
                        HTMLMetaAttribute attribute = new HTMLMetaAttribute();

                        attribute.setName(key);
                        attribute.setValue(value);

                        // append to meta tag
                        meta.getAttributes().add(attribute);
                    } else {
                        if (key.equals("href") && inBase != 0) {
                            if (value.length() != 0) {
                                // Resolve against the current base so a relative base href works,
                                // and keep the current base when the value is malformed: a bad
                                // <base> tag should not abort the parse of the whole document.
                                try {
                                    baseURL = new URL(baseURL, value);
                                } catch (MalformedURLException e) {
                                    LOG.warn("Ignoring malformed base href:" + value + " base:" + baseURL);
                                }
                            }
                        } else if (activeLink != null) {
                            // the tag's designated attribute is the target; the rest are metadata
                            if (linkTypeToSrcMap.get(activeLink.type).equalsIgnoreCase(key)) {
                                activeLink.linkURL = value;
                            } else {
                                activeLink.jsonObject.addProperty(key, value);
                            }
                        } else if (blockInConstruction != null) {
                            if (key.equals("class")) {
                                blockInConstruction.classId = value;
                            } else if (key.equals("id")) {
                                blockInConstruction.htmlId = value;
                            }
                        }
                    }
                } else {
                    // rewind and let outer control block deal with it
                    --i;
                }
                break;
            }
            case ParserInstruction.SetTitle: {
                activeParseResult.setTitle(domArgument);
                break;
            }
            // all other instructions (e.g. AddContent, AddEntity, AddComment) are ignored
            }
        }
        return null;
    }
}