Usage examples for the org.apache.hadoop.mapred.JobConf no-argument constructor:
public JobConf()
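All of the examples below follow the same basic pattern: construct a default JobConf, resolve a Hadoop FileSystem from a Path against that configuration, and hand the conf on to taps, tools, or workflows. The following minimal sketch isolates just that pattern; the output path is a placeholder, and the Bixo/Cascading-specific classes used in the full examples are omitted.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfExample {
    public static void main(String[] args) throws Exception {
        // The no-argument constructor loads the default Hadoop configuration
        // (core-site.xml, mapred-site.xml) from the classpath.
        JobConf conf = new JobConf();

        // Typical usage seen in the examples below: resolve a FileSystem for a
        // path and prepare an output directory before running a crawl or flow.
        Path outputPath = new Path("crawl-output"); // placeholder path
        FileSystem fs = outputPath.getFileSystem(conf);
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);
        }
    }
}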
From source file:bixo.examples.crawl.SimpleCrawlTool.java
License:Apache License
public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir
            // In the /urls dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        if (domain != null) {
            urlFilter = new DomainUrlFilter(domain);
        }

        // OK, now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.crawl.SimpleCrawlWorkflowLRTest.java
License:Apache License
@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);
    HadoopUtils.safeRemove(fs, baseDirPath);

    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    SimpleCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);

    BaseUrlFilter urlFilter = new BaseUrlFilter() {
        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the /urls dir has 11 entries with
        // one being previously crawled, and the other 10 being pending.
        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();

        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            //         datum.getUrl(), datum.getLastStatus(), datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);

        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }
}
From source file:bixo.examples.crawl.SimpleStatusTool.java
License:Apache License
public static void main(String[] args) {
    SimpleStatusToolOptions options = new SimpleStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getCrawlDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }

                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);
            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}
From source file:bixo.examples.JDBCCrawlTool.java
License:Open Source License
public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }

    JDBCTapFactory.shutdown();
}
From source file:bixo.examples.SimpleCrawlTool.java
License:Open Source License
public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir
            // In the /urls dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            importOneDomain(domain, crawlDbPath, conf);
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // OK, now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.webmining.DemoWebMiningTool.java
License:Apache License
public static void main(String[] args) throws IOException {
    DemoWebMiningOptions options = new DemoWebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.examples.webmining.WebMiningTool.java
License:Apache License
public static void main(String[] args) throws IOException {
    WebMiningOptions options = new WebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = WebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, curLoop == 1);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:bixo.fetcher.FetcherTest.java
License:Open Source License
@Test
public void testStaleConnection() throws Exception {
    System.setProperty("bixo.root.level", "TRACE");

    String workingFolder = "build/it/FetcherTest/testStaleConnection/working";
    String inputPath = makeCrawlDb(workingFolder, "src/it/resources/apple-pages.txt");
    Lfs in = new Lfs(new SequenceFile(UrlDatum.FIELDS), inputPath, true);

    String outPath = "build/it/FetcherTest/testStaleConnection/out";
    Lfs content = new Lfs(new SequenceFile(FetchedDatum.FIELDS), outPath + "/content", true);
    Lfs status = new Lfs(new SequenceFile(StatusDatum.FIELDS), outPath + "/status", true);

    Pipe pipe = new Pipe("urlSource");

    UserAgent userAgent = new FirefoxUserAgent();
    FetcherPolicy fetcherPolicy = new FetcherPolicy();
    fetcherPolicy.setMaxRequestsPerConnection(1);
    fetcherPolicy.setCrawlDelay(5 * 1000L);
    BaseFetcher fetcher = new SimpleHttpFetcher(2, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

    FlowConnector flowConnector = new FlowConnector();
    Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
    flow.complete();

    // Test for all valid fetches.
    Lfs validate = new Lfs(new SequenceFile(StatusDatum.FIELDS), outPath + "/status");
    TupleEntryIterator tupleEntryIterator = validate.openForRead(new JobConf());
    while (tupleEntryIterator.hasNext()) {
        TupleEntry entry = tupleEntryIterator.next();
        StatusDatum sd = new StatusDatum(entry);
        if (sd.getStatus() != UrlStatus.FETCHED) {
            LOGGER.error(String.format("Fetched failed! Status is %s for %s", sd.getStatus(), sd.getUrl()));
            BaseFetchException e = sd.getException();
            if (e != null) {
                LOGGER.error("Fetched failed due to exception", e);
            }

            Assert.fail("Status not equal to FETCHED");
        }
    }
}
From source file:bixo.fetcher.FetcherTest.java
License:Open Source License
@Test
public void testRunFetcher() throws Exception {
    System.setProperty("bixo.root.level", "TRACE");

    String workingFolder = "build/test-it/FetcherTest/testRunFetcher";
    String inputPath = makeCrawlDb(workingFolder, "src/it/resources/top10urls.txt");
    Lfs in = new Lfs(new SequenceFile(UrlDatum.FIELDS), inputPath, true);

    Lfs content = new Lfs(new SequenceFile(FetchedDatum.FIELDS), workingFolder + "/content", true);
    Lfs status = new Lfs(new TextLine(), workingFolder + "/status", true);

    Pipe pipe = new Pipe("urlSource");

    UserAgent userAgent = new FirefoxUserAgent();
    BaseFetcher fetcher = new SimpleHttpFetcher(10, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

    FlowConnector flowConnector = new FlowConnector();
    Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
    flow.complete();

    // Test for 10 good fetches.
    Lfs validate = new Lfs(new SequenceFile(FetchedDatum.FIELDS), workingFolder + "/content");
    TupleEntryIterator tupleEntryIterator = validate.openForRead(new JobConf());
    int fetchedPages = 0;
    while (tupleEntryIterator.hasNext()) {
        TupleEntry entry = tupleEntryIterator.next();
        new FetchedDatum(entry);
        fetchedPages += 1;
    }

    Assert.assertEquals(10, fetchedPages);
}
From source file:cascading.avro.AvroSchemeTest.java
License:Apache License
@Test
public void listOrMapInsideListTest() throws Exception {
    final Schema schema = new Schema.Parser().parse(getClass().getResourceAsStream("test4.avsc"));
    final AvroScheme scheme = new AvroScheme(schema);
    final Fields fields = new Fields("aListOfListOfInt", "aListOfMapToLong");
    final Lfs lfs = new Lfs(scheme, tempDir.getRoot().toString());

    HadoopFlowProcess writeProcess = new HadoopFlowProcess(new JobConf());
    final TupleEntryCollector collector = lfs.openForWrite(writeProcess);

    List<Map<String, Long>> aListOfMapToLong = new ArrayList<Map<String, Long>>();
    Map<String, Long> aMapToLong = new HashMap<String, Long>();
    aMapToLong.put("one", 1L);
    aMapToLong.put("two", 2L);
    aListOfMapToLong.add(aMapToLong);

    List<List<Integer>> aListOfListOfInt = new ArrayList<List<Integer>>();
    List<Integer> aListOfInt = new LinkedList<Integer>();
    aListOfInt.add(0);
    aListOfInt.add(1);
    aListOfListOfInt.add(aListOfInt);

    write(scheme, collector, new TupleEntry(fields, new Tuple(aListOfListOfInt, aListOfMapToLong)));
    collector.close();

    HadoopFlowProcess readProcess = new HadoopFlowProcess(new JobConf());
    final TupleEntryIterator iterator = lfs.openForRead(readProcess);
    assertTrue(iterator.hasNext());
    final TupleEntry readEntry1 = iterator.next();

    List<Integer> outListOfInt = (List) ((List) readEntry1.getObject("aListOfListOfInt")).get(0);
    Map<Utf8, Long> outMapToLong = (Map) ((List) readEntry1.getObject("aListOfMapToLong")).get(0);

    assertEquals(Integer.valueOf(0), outListOfInt.get(0));
    assertEquals(Integer.valueOf(1), outListOfInt.get(1));
    assertEquals(Long.valueOf(1L), outMapToLong.get("one"));
    assertEquals(Long.valueOf(2L), outMapToLong.get("two"));

    assertTrue(!iterator.hasNext());
}