cn.edu.hfut.dmic.webcollector.crawldb.Injector.java Source code

Java tutorial

Introduction

Here is the source code for cn.edu.hfut.dmic.webcollector.crawldb.Injector.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package cn.edu.hfut.dmic.webcollector.crawldb;

import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.util.CrawlerConfiguration;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 *
 * @author hu
 */
public class Injector {

    public static final Logger LOG = LoggerFactory.getLogger(Injector.class);

    public static void inject(Path crawlPath, CrawlDatums datums, Configuration conf)
            throws IOException, InterruptedException, ClassNotFoundException, Exception {
        Path crawldbPath = new Path(crawlPath, "crawldb");
        FileSystem fs = FileSystem.get(conf);
        Path tempdb = new Path(crawldbPath, "temp");
        if (fs.exists(tempdb)) {
            fs.delete(tempdb);
        }

        SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, new Path(tempdb, "info"), Text.class,
                CrawlDatum.class);

        for (CrawlDatum datum : datums) {

            String key = datum.getKey();
            writer.append(new Text(key), datum);
            LOG.info("inject:" + key);
        }
        writer.close();

        Path[] mergePaths = new Path[] { tempdb };

        Merge.merge(crawlPath, mergePaths, conf, "inject");
        Merge.install(crawlPath, conf);

        if (fs.exists(tempdb)) {
            fs.delete(tempdb);
        }

    }

}