org.cloudata.examples.web.WebTableJob.java Source code

Java tutorial

Introduction

Here is the source code for org.cloudata.examples.web.WebTableJob.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cloudata.examples.web;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.cloudata.core.client.Cell;
import org.cloudata.core.client.CTable;
import org.cloudata.core.client.Row;
import org.cloudata.core.common.conf.CloudataConf;
import org.cloudata.core.tablet.TableSchema;

/**
 * Example map-only Hadoop job that loads a line-oriented text dump of web
 * documents into the Cloudata table {@link #WEB_TABLE}.
 *
 * <p>{@link #exec(String[])} creates the table when it does not exist yet, runs
 * {@link WebTableMap} over the input path and finally removes the job's
 * temporary output directory. All inserts happen inside the mapper; the job
 * itself produces no output records.
 *
 * <p>NOTE(review): the original (non-English) class comment was destroyed by an
 * encoding error. Its surviving fragments mention a "termtable" filled from the
 * terms of the webtable; nothing in this class touches such a table.
 *
 * @author babokim
 */
public class WebTableJob {
    public static final Log LOG = LogFactory.getLog(WebTableJob.class.getName());
    /** Name of the Cloudata table that receives the parsed documents. */
    public static final String WEB_TABLE = "T_WEB";
    /** Columns the table is created with; this job fills url, title and contents. */
    public static final String[] WEB_TABLE_COLUMNS = { "url", "title", "contents", "anchor" };

    /**
     * Runs the load job.
     *
     * @param options {@code options[0]} is the input path handed to the job
     * @throws Exception if the table cannot be created, the job fails, or the
     *         temporary output directory cannot be removed after a successful run
     */
    public void exec(String[] options) throws Exception {
        if (options.length < 1) {
            System.out.println("Usage: java TestWebPage <num of repeats> webtable <inputPath>");
            System.exit(0);
        }
        // Create the web table on first use.
        CloudataConf nconf = new CloudataConf();
        if (!CTable.existsTable(nconf, WEB_TABLE)) {
            TableSchema webTableInfo = new TableSchema(WEB_TABLE, "Test", WEB_TABLE_COLUMNS);
            webTableInfo.setNumOfVersion(2);
            CTable.createTable(nconf, webTableInfo);
        }

        // The mapper writes straight into Cloudata, so the job output is a
        // throw-away directory that only exists to satisfy the output format.
        Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

        JobConf jobConf = new JobConf(WebTableJob.class);
        jobConf.setJobName("WebTableJob" + "(" + new Date() + ")");
        FileInputFormat.addInputPath(jobConf, new Path(options[0]));

        //<MAP>
        jobConf.setMapperClass(WebTableMap.class);
        jobConf.setInputFormat(TextInputFormat.class);
        // NOTE(review): presumably meant to disable task retries - confirm how
        // the Hadoop version in use treats a value of 0.
        jobConf.setMaxMapAttempts(0);
        //</MAP>

        //Map Only
        jobConf.setNumReduceTasks(0);
        FileOutputFormat.setOutputPath(jobConf, tempOutputPath);

        //Run Job
        try {
            JobClient.runJob(jobConf);
        } catch (Exception e) {
            // Do not leave the temporary directory behind when the job fails,
            // and do not let a cleanup problem hide the real failure.
            deleteTempOutputQuietly(jobConf, tempOutputPath);
            throw e;
        }

        //delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(tempOutputPath, true);
    }

    /** Best-effort removal of the temporary output directory; failures are only logged. */
    private static void deleteTempOutputQuietly(JobConf jobConf, Path tempOutputPath) {
        try {
            FileSystem.get(jobConf).delete(tempOutputPath, true);
        } catch (Exception cleanupFailure) {
            LOG.warn("Cannot delete temp output path " + tempOutputPath, cleanupFailure);
        }
    }

    /**
     * Mapper that reassembles documents from consecutive input lines and stores
     * each one as a row of {@link WebTableJob#WEB_TABLE}.
     *
     * <p>A line equal to {@link #DELIM} separates documents. Lines seen before
     * the first delimiter are ignored, and a document is stored only when the
     * delimiter that follows it is read; lines collected after the last
     * delimiter are discarded when the mapper closes.
     */
    static class WebTableMap implements Mapper<WritableComparable, Writable, Text, Text> {
        static final String DELIM = "==NEWLINE==";
        static final String TITLE = "title:";
        static final String LINK = "link:";
        static final String BODY = "body:";
        /** Charset used both to decode the input lines and to encode stored cells. */
        private static final String ENCODING = "EUC-KR";
        /** Lines of the document currently being collected. */
        private List<String> docLines = new ArrayList<String>();
        private CTable webTable;
        int insertedCount = 0;
        /** True until the first delimiter has been seen. */
        private boolean skip = true;
        /** Upper bound, in characters, for the contents of one stored document. */
        private static final int MAX_VALUE = 500 * 1024; //(500K)

        /**
         * Consumes one input line: a delimiter flushes the collected document,
         * any other line is appended to it.
         *
         * @param key ignored
         * @param value the input line; must be a {@link Text}
         * @param collector unused, the mapper emits nothing
         * @param reporter unused
         * @throws IOException if the line cannot be decoded
         */
        public void map(WritableComparable key, Writable value, OutputCollector<Text, Text> collector,
                Reporter reporter) throws IOException {
            Text tvalue = (Text) value;

            // Text's backing array may be longer than the value, hence getLength().
            String valueStr = new String(tvalue.getBytes(), 0, tvalue.getLength(), ENCODING);

            if (DELIM.equals(valueStr)) {
                skip = false;
                if (docLines.size() > 0) {
                    parseAndStoreDoc();
                    docLines.clear();
                }
            } else {
                if (!skip) {
                    docLines.add(valueStr);
                }
            }
        }

        /**
         * Splits the collected lines into link, title and contents and writes
         * them as one row keyed by the hash code of the link.
         *
         * <p>Documents without a link line or with oversized contents are
         * skipped. Failures while storing a row are logged and do not abort the
         * task.
         */
        private void parseAndStoreDoc() throws IOException {
            String link = null;
            String title = null;
            StringBuilder contents = new StringBuilder();

            for (String eachLine : docLines) {
                if (eachLine.startsWith(TITLE)) {
                    // The timestamp makes every load produce a distinct value.
                    title = getTitle(eachLine) + (new Date()).toString();
                } else if (eachLine.startsWith(LINK)) {
                    link = getLink(eachLine);
                } else {
                    if (contents.length() == 0 && eachLine.startsWith(BODY)) {
                        contents.append(eachLine.substring(BODY.length()).trim()).append("\n");
                    } else {
                        contents.append(eachLine).append("\n");
                    }
                }
            }

            // The row key is derived from the link, so a document without one
            // cannot be stored; skip it instead of failing the whole task.
            if (link == null) {
                LOG.warn("Skip document without link, title=" + title + ", lines=" + docLines.size());
                return;
            }

            contents.append((new Date()).toString());
            if (contents.length() > MAX_VALUE) {
                LOG.info("Skip " + link + " cause data too big: " + contents.length());
                return;
            }

            // NOTE(review): different links with equal hash codes share a row.
            Row.Key rowKey = new Row.Key(String.valueOf(link.hashCode()));

            try {
                Row row = new Row(rowKey);
                row.addCell("url", new Cell(Cell.Key.EMPTY_KEY, link.getBytes(ENCODING)));
                if (title != null) {
                    row.addCell("title", new Cell(Cell.Key.EMPTY_KEY, title.getBytes(ENCODING)));
                }
                row.addCell("contents", new Cell(Cell.Key.EMPTY_KEY, contents.toString().getBytes(ENCODING)));
                webTable.put(row);
                insertedCount++;
                if (insertedCount % 100 == 0) {
                    LOG.info(insertedCount + " inserted, rowKey=" + link + ", title=" + title);
                }
            } catch (Exception e) {
                // Best effort: one bad document must not stop the load.
                LOG.error(rowKey + "," + e.getMessage(), e);
                return;
            }
        }

        /** Returns the text following the {@link #TITLE} prefix, trimmed. */
        private String getTitle(String line) {
            return line.substring(TITLE.length()).trim();
        }

        /** Returns the text following the {@link #LINK} prefix, trimmed. */
        private String getLink(String line) {
            return line.substring(LINK.length()).trim();
        }

        /**
         * Opens the target table once per task.
         *
         * @param jobConf unused; a fresh {@code CloudataConf} is created instead
         * @throws IllegalStateException if the table cannot be opened, because
         *         every later insert would fail anyway
         */
        public void configure(JobConf jobConf) {
            try {
                CloudataConf conf = new CloudataConf();
                webTable = CTable.openTable(conf, WEB_TABLE);
            } catch (IOException e) {
                throw new IllegalStateException("Cannot open table " + WEB_TABLE, e);
            }
            // NOTE(review): guards against openTable signalling a missing table
            // with null rather than an exception - confirm against the client API.
            if (webTable == null) {
                throw new IllegalStateException("Cannot open table " + WEB_TABLE);
            }
        }

        /** Logs how many rows this task inserted; pending lines are not flushed. */
        public void close() throws IOException {
            LOG.info("Total Insert Record: " + insertedCount);
        }
    }
}