Example usage for org.apache.hadoop.mapred.JobConf: JobConf()

Introduction

This page lists usage examples for the no-argument constructor JobConf() of org.apache.hadoop.mapred.JobConf.

Prototype

public JobConf() 

Document

Construct a map/reduce job configuration.
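
The examples below share a common pattern: construct a default JobConf (which loads the standard Hadoop configuration files from the classpath) and then use it to resolve a FileSystem or to configure an output format. A minimal, self-contained sketch of that pattern follows; the class name and path are illustrative, not taken from the examples:

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfUsageSketch {
    public static void main(String[] args) throws Exception {
        // Construct a map/reduce job configuration with default settings.
        JobConf conf = new JobConf();

        // Typical use in the examples below: resolve the FileSystem for a path.
        Path outputPath = new Path("/tmp/jobconf-example"); // illustrative path
        FileSystem fs = outputPath.getFileSystem(conf);
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);
        }
    }
}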

Usage

From source file:com.finderbots.miner.MinerTool.java

License:Apache License

public static void main(String[] args) throws IOException {

    MinerOptions options = new MinerOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.

    try {

        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, options.getUrlsFile());

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Set up a URL filter so that we only process the URLs we actually want,
        // rather than every URL we extract.
        String crawlUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> crawlUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(crawlUrlFiltersFile);
        BaseUrlFilter crawlUrlFilter = new RegexUrlFilter(
                crawlUrlPatterns.toArray(new String[crawlUrlPatterns.size()]));

        // Set up a miner filter so that we mine only pages whose URLs match one of the patterns.
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        List<String> mineUrlPatterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        BaseUrlFilter mineUrlFilter = new RegexUrlFilter(
                mineUrlPatterns.toArray(new String[mineUrlPatterns.size()]));

        // Run the crawl for the configured number of loops.
        for (int curLoop = 1; curLoop <= options.getNumLoops(); curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = MinerWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, crawlUrlFilter, mineUrlFilter);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }

    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerTool.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial crawl db seeded from the domain or urls file.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // Setting a crawl duration is a good idea for long crawls: the fetch can slow
        // down due to a 'long tail' of slow sites, and a fixed duration means you know
        // exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // Set up a URL filter so that we only process the URLs we actually want,
        // rather than every URL we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        AnalyzeHtml analyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            analyzer = new AnalyzeHtml(urlsToMineFilter);
        }

        // Now that the settings are in place, start the crawl loops.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = PinterestCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, analyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerTool.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial crawl db seeded from the domain or urls file.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // Setting a crawl duration is a good idea for long crawls: the fetch can slow
        // down due to a 'long tail' of slow sites, and a fixed duration means you know
        // exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // Set up a URL filter so that we only process the URLs we actually want,
        // rather than every URL we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        MineRTCriticsPreferences prefsAnalyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            prefsAnalyzer = new MineRTCriticsPreferences(urlsToMineFilter);
        }

        // Now that the settings are in place, start the crawl loops.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = RTCriticsCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, prefsAnalyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.utilities.ExportPinterestPrefsTool.java

License:Apache License

public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // get the urls of users, urls of followed people, make sure they are unique, create an index
        // and write the ids out as CSV file of prefs for mahout input.
        Flow exportPinterestPrefsWorkFlow = ExportPinterestPrefsWorkflow.createFlow(crawlPath, options);
        exportPinterestPrefsWorkFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.utilities.ExportTool.java

License:Apache License

public static void main(String[] args) {
    ExportToolOptions options = new ExportToolOptions();
    CmdLineParser parser = new CmdLineParser(options);
    String outputDirName;

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    try {
        outputDirName = options.getOutputDir();
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        Path crawlPath = new Path(options.getCrawlDir());
        FileSystem fs = outputPath.getFileSystem(conf);

        // create a flow that takes all parsed text and accumulates into a single sink in mahout format
        Flow exportToMahoutFlow = ExportAllToMahoutWorkflow.createFlow(crawlPath, options);
        exportToMahoutFlow.complete();
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.firewallid.io.HBaseWrite.java

public void save(String tableName, JavaPairRDD<String, String> savePairRDD, String destColumn)
        throws IOException {
    /* Check hbase table */
    if (!HBaseTableUtils.istableExists(tableName)) {
        throw new TableNotFoundException();
    }

    /* Check column family */
    if (!HBaseTableUtils.isFamilyExists(tableName, destColumn.split(":")[0])) {
        throw new NoSuchColumnFamilyException();
    }

    /* Save to HBase */
    JobConf jobConf = new JobConf();
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName);

    savePairRDD.mapToPair((Tuple2<String, String> t) -> convertRowToPut(t._1, destColumn, t._2))
            .filter((Tuple2<ImmutableBytesWritable, Put> t1) -> t1 != null).saveAsHadoopDataset(jobConf);
}

From source file:com.firewallid.io.HBaseWrite.java

public void save(String tableName, JavaPairRDD<String, Map<String, String>> savePairRDD,
        List<String> destFamilys) throws IOException {
    /* Check hbase table */
    if (!HBaseTableUtils.istableExists(tableName)) {
        throw new TableNotFoundException();
    }

    /* Check column family */
    if (!HBaseTableUtils.isFamilyExists(tableName, destFamilys)) {
        throw new NoSuchColumnFamilyException();
    }

    /* Save to HBase */
    JobConf jobConf = new JobConf();
    jobConf.setOutputFormat(TableOutputFormat.class);
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, tableName);

    savePairRDD.mapToPair((Tuple2<String, Map<String, String>> t) -> convertRowToPut(t))
            .filter((Tuple2<ImmutableBytesWritable, Put> t1) -> t1 != null).saveAsHadoopDataset(jobConf);
}

From source file:com.flaptor.hounder.crawler.Nutch9Fetcher.java

License:Apache License

/**
 * Create a nutch fetchlist segment from the provided list of pages.
 * @param fetchlist the list of pages from which to build the segment.
 */
private String buildSegment(FetchList fetchlist) throws IOException {
    // create the segment dir
    String segmentDir = getNewSegmentDir();
    Path output = new Path(segmentDir, CrawlDatum.GENERATE_DIR_NAME);
    JobConf job = new JobConf();
    job.setOutputPath(output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    // job.setOutputFormat(SequenceFileOutputFormat.class);
    // job.setOutputKeyComparatorClass(HashComparator.class);
    RecordWriter writer = new SequenceFileOutputFormat().getRecordWriter(null, job, "fetcher",
            new NoProgress());
    for (com.flaptor.hounder.crawler.pagedb.Page page : fetchlist) {
        Text key = new Text(page.getUrl());
        CrawlDatum value = new CrawlDatum(); // TODO: try taking this line outside of the loop
        writer.write(key, value);
    }
    writer.close(null);
    return segmentDir;
}

From source file:com.github.gaoyangthu.core.hbase.ConfigurationUtils.java

License:Apache License

/**
 * Creates a new {@link org.apache.hadoop.conf.Configuration} by merging the given configurations.
 * Ordering is important - the second configuration overrides values in the first.
 *
 * @param one configuration to read from. May be null.
 * @param two configuration to read from. May be null.
 * @return the result of merging the two configurations.
 */
public static Configuration merge(Configuration one, Configuration two) {
    if (one == null) {
        if (two == null) {
            return new JobConf();
        }
        return new JobConf(two);
    }

    Configuration c = new JobConf(one);

    if (two == null) {
        return c;
    }

    for (Map.Entry<String, String> entry : two) {
        c.set(entry.getKey(), entry.getValue());
    }

    return c;
}
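
A quick usage sketch of this helper, assuming two JobConf instances that set the same property (the key name and class name below are illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import com.github.gaoyangthu.core.hbase.ConfigurationUtils;

public class MergeSketch {
    public static void main(String[] args) {
        JobConf one = new JobConf();
        one.set("example.key", "first");
        JobConf two = new JobConf();
        two.set("example.key", "second");

        // The second configuration overrides the first, so this prints "second".
        Configuration merged = ConfigurationUtils.merge(one, two);
        System.out.println(merged.get("example.key"));
    }
}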

From source file:com.google.cloud.dataflow.contrib.sorter.ExternalSorter.java

License:Apache License

/**
 * Initializes the hadoop sorter. Does some local file system setup, and is somewhat expensive
 * (~20 ms on local machine). Only executed when necessary.
 */
private void initHadoopSorter() throws IOException {
    if (!initialized) {
        tempDir = new Path(options.getTempLocation(), "tmp" + UUID.randomUUID().toString());
        paths = new Path[] { new Path(tempDir, "test.seq") };

        JobConf conf = new JobConf();
        writer = SequenceFile.createWriter(conf, Writer.valueClass(BytesWritable.class),
                Writer.keyClass(BytesWritable.class), Writer.file(paths[0]),
                Writer.compression(CompressionType.NONE));

        FileSystem fs = FileSystem.getLocal(conf);
        sorter = new SequenceFile.Sorter(fs, new BytesWritable.Comparator(), BytesWritable.class,
                BytesWritable.class, conf);
        sorter.setMemory(options.getMemoryMB() * 1024 * 1024);

        initialized = true;
    }
}