Example usage for org.apache.hadoop.mapred.JobConf: JobConf()

Introduction

This page lists usage examples for the no-argument constructor JobConf() of org.apache.hadoop.mapred.JobConf.

Prototype

public JobConf() 

Document

Construct a map/reduce job configuration.
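
The examples below share a common pattern: construct a default JobConf (which loads the standard Hadoop configuration files from the classpath) and then use it to resolve a FileSystem or to configure an output format. A minimal, self-contained sketch of that pattern follows; the class name and path are illustrative, not taken from the examples:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfUsageSketch {
    public static void main(String[] args) throws Exception {
        // Construct a map/reduce job configuration with default settings.
        JobConf conf = new JobConf();

        // Typical use in the examples below: resolve the FileSystem for a path.
        Path outputPath = new Path("/tmp/jobconf-example"); // illustrative path
        FileSystem fs = outputPath.getFileSystem(conf);
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);
        }
    }
}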

Usage

From source file:com.finderbots.miner.MinerTool.java

License:Apache License

public static void main(String[] args) throws IOException {

    MinerOptions options = new MinerOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.

    try {

        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, options.getUrlsFile());

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Set up a URL filter so that we only process the URLs we actually want,
        // rather than every URL we extract.
        String crawlUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> crawlUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(crawlUrlFiltersFile);
        BaseUrlFilter crawlUrlFilter = new RegexUrlFilter(
                crawlUrlPatterns.toArray(new String[crawlUrlPatterns.size()]));

        // Set up a miner filter so that we mine only pages whose URLs match one of the patterns.
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> mineUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        BaseUrlFilter mineUrlFilter = new RegexUrlFilter(
                mineUrlPatterns.toArray(new String[mineUrlPatterns.size()]));

        // Run the crawl for the configured number of loops.
        for (int curLoop = 1; curLoop <= options.getNumLoops(); curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = MinerWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, crawlUrlFilter, mineUrlFilter);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }

    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerTool.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial crawl db seeded from the domain or urls file.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // Setting a crawl duration is a good idea for long crawls: the fetch can slow
        // down due to a 'long tail' of slow sites, and a fixed duration means you know
        // exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // Set up a URL filter so that we only process the URLs we actually want,
        // rather than every URL we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        AnalyzeHtml analyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            analyzer = new AnalyzeHtml(urlsToMineFilter);
        }

        // Now that the settings are in place, start the crawl loops.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = PinterestCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, analyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerTool.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial crawl db seeded from the domain or urls file.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // Setting a crawl duration is a good idea for long crawls: the fetch can slow
        // down due to a 'long tail' of slow sites, and a fixed duration means you know
        // exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // Set up a URL filter so that we only process the URLs we actually want,
        // rather than every URL we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        MineRTCriticsPreferences prefsAnalyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            prefsAnalyzer = new MineRTCriticsPreferences(urlsToMineFilter);
        }

        // Now that the settings are in place, start the crawl loops.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = RTCriticsCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, prefsAnalyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.utilities.ExportPinterestPrefsTool.java

License:Apache License

public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // get the urls of users, urls of followed people, make sure they are unique, create an index
        // and write the ids out as CSV file of prefs for mahout input.
        Flow exportPinterestPrefsWorkFlow = ExportPinterestPrefsWorkflow.createFlow(crawlPath, options);
        exportPinterestPrefsWorkFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.utilities.ExportTool.java

License:Apache License

public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // create a flow that takes all parsed text and accumulates into a single sink in mahout format
        Flow exportToMahoutFlow = ExportAllToMahoutWorkflow.createFlow(crawlPath, options);
        exportToMahoutFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.firewallid.io.HBaseWrite.java

public void save(String tableName, JavaPairRDD<String, String> savePairRDD, String destColumn)
        throws IOException {
    /* Check hbase table */
    if (!HBaseTableUtils.istableExists(tableName)) {
        throw new TableNotFoundException();
    }

    /* Check column family */
    if (!HBaseTableUtils.isFamilyExists(tableName, destColumn.split(":")[0])) {
        throw new NoSuchColumnFamilyException();
    }

    /* Save to HBase */
    JobConf jobConf = new JobConf();
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName);

    savePairRDD.mapToPair((Tuple2<String, String> t) -> convertRowToPut(t._1, destColumn, t._2))
            .filter((Tuple2<ImmutableBytesWritable, Put> t1) -> t1 != null).saveAsHadoopDataset(jobConf);
}

From source file:com.firewallid.io.HBaseWrite.java

public void save(String tableName, JavaPairRDD<String, Map<String, String>> savePairRDD,
        List<String> destFamilys) throws IOException {
    /* Check hbase table */
    if (!HBaseTableUtils.istableExists(tableName)) {
        throw new TableNotFoundException();
    }

    /* Check column family */
    if (!HBaseTableUtils.isFamilyExists(tableName, destFamilys)) {
        throw new NoSuchColumnFamilyException();
    }

    /* Save to HBase */
    JobConf jobConf = new JobConf();
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName);

    savePairRDD.mapToPair((Tuple2<String, Map<String, String>> t) -> convertRowToPut(t))
            .filter((Tuple2<ImmutableBytesWritable, Put> t1) -> t1 != null).saveAsHadoopDataset(jobConf);
}

From source file:com.flaptor.hounder.crawler.Nutch9Fetcher.java

License:Apache License

/**
 * Create a nutch fetchlist segment from the provided list of pages.
 * @param fetchlist the list of pages from which to build the segment.
 */
private String buildSegment(FetchList fetchlist) throws IOException {
    // create the segment dir
    String segmentDir = getNewSegmentDir();
    Path output = new Path(segmentDir, CrawlDatum.GENERATE_DIR_NAME);
    JobConf job = new JobConf();
    job.setOutputPath(output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    // job.setOutputFormat(SequenceFileOutputFormat.class);
    // job.setOutputKeyComparatorClass(HashComparator.class);
    RecordWriter writer = new SequenceFileOutputFormat().getRecordWriter(null, job, "fetcher",
            new NoProgress());
    for (com.flaptor.hounder.crawler.pagedb.Page page : fetchlist) {
        Text key = new Text(page.getUrl());
        CrawlDatum value = new CrawlDatum(); // TODO: try taking this line outside of the loop
        writer.write(key, value);
    }
    writer.close(null);
    return segmentDir;
}

From source file:com.github.gaoyangthu.core.hbase.ConfigurationUtils.java

License:Apache License

/**
 * Creates a new {@link org.apache.hadoop.conf.Configuration} by merging the given configurations.
 * Ordering is important - the second configuration overrides values in the first.
 *
 * @param one configuration to read from. May be null.
 * @param two configuration to read from. May be null.
 * @return the result of merging the two configurations.
 */
public static Configuration merge(Configuration one, Configuration two) {
    if (one == null) {
        if (two == null) {
            return new JobConf();
        }
        return new JobConf(two);
    }

    Configuration c = new JobConf(one);

    if (two == null) {
        return c;
    }

    for (Map.Entry<String, String> entry : two) {
        c.set(entry.getKey(), entry.getValue());
    }

    return c;
}
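
A quick usage sketch of this helper, assuming two JobConf instances that set the same property (the key name and class name below are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import com.github.gaoyangthu.core.hbase.ConfigurationUtils;

public class MergeSketch {
    public static void main(String[] args) {
        JobConf one = new JobConf();
        one.set("example.key", "first");
        JobConf two = new JobConf();
        two.set("example.key", "second");

        // The second configuration overrides the first, so this prints "second".
        Configuration merged = ConfigurationUtils.merge(one, two);
        System.out.println(merged.get("example.key"));
    }
}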

From source file:com.google.cloud.dataflow.contrib.sorter.ExternalSorter.java

License:Apache License

/**
 * Initializes the hadoop sorter. Does some local file system setup, and is somewhat expensive
 * (~20 ms on local machine). Only executed when necessary.
 */
private void initHadoopSorter() throws IOException {
    if (!initialized) {
        tempDir = new Path(options.getTempLocation(), "tmp" + UUID.randomUUID().toString());
        paths = new Path[] { new Path(tempDir, "test.seq") };

        JobConf conf = new JobConf();
        writer = SequenceFile.createWriter(conf, Writer.valueClass(BytesWritable.class),
                Writer.keyClass(BytesWritable.class), Writer.file(paths[0]),
                Writer.compression(CompressionType.NONE));

        FileSystem fs = FileSystem.getLocal(conf);
        sorter = new SequenceFile.Sorter(fs, new BytesWritable.Comparator(), BytesWritable.class,
                BytesWritable.class, conf);
        sorter.setMemory(options.getMemoryMB() * 1024 * 1024);

        initialized = true;
    }
}