Example usage for org.apache.hadoop.mapred JobConf JobConf

Introduction

This page collects example usages of the org.apache.hadoop.mapred.JobConf no-argument constructor, JobConf(), drawn from open source projects.

Prototype

public JobConf() 

Document

Construct a map/reduce job configuration.
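
As a quick orientation before the project examples below, here is a minimal sketch of the common pattern: construct a JobConf with the no-argument constructor, then use it to resolve the FileSystem for a Path. The class name and output path are hypothetical and not taken from the examples.

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfExample {
    public static void main(String[] args) throws Exception {
        // Construct a map/reduce job configuration; this picks up the default
        // Hadoop configuration resources found on the classpath.
        JobConf conf = new JobConf();
        conf.setJobName("jobconf-example");

        // Hypothetical output location; resolve the FileSystem that backs it.
        Path outputPath = new Path("build/jobconf-example-output");
        FileSystem fs = outputPath.getFileSystem(conf);

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);
        }
    }
}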

Usage

From source file: bixo.examples.crawl.SimpleCrawlTool.java

License: Apache License

public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // If the user is starting from scratch, set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir
            // In the /urls dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set a crawl duration when running long crawls: the fetch
        // may slow down due to a 'long tail' of slow sites, and specifying a duration
        // means you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a URL filter we only process the URLs we care about,
        // instead of every URL that we extract.
        BaseUrlFilter urlFilter = null;
        if (domain != null) {
            urlFilter = new DomainUrlFilter(domain);
        }

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file: bixo.examples.crawl.SimpleCrawlWorkflowLRTest.java

License: Apache License

@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    SimpleCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {

        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the
        // /urls dir has 11 entries with
        // one being previously crawled, and the other 10 being pending.

        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;

                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();
        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            // datum.getUrl(), datum.getLastStatus(),
            // datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);

        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }

}

From source file: bixo.examples.crawl.SimpleStatusTool.java

License: Apache License

public static void main(String[] args) {
    SimpleStatusToolOptions options = new SimpleStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getCrawlDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }

                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);

            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}

From source file: bixo.examples.JDBCCrawlTool.java

License: Open Source License

public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    JDBCTapFactory.shutdown();
}

From source file: bixo.examples.SimpleCrawlTool.java

License: Open Source License

public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain.startsWith("http")) {
        System.err.println(
                "The target domain should be specified as just the host, without the http protocol: " + domain);
        printUsageAndExit(parser);
    }

    if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) {
        System.err.println(
                "The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
        printUsageAndExit(parser);
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // If the user is starting from scratch, set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir
            // In the /urls dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            importOneDomain(domain, crawlDbPath, conf);
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set a crawl duration when running long crawls: the fetch
        // may slow down due to a 'long tail' of slow sites, and specifying a duration
        // means you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a URL filter we only process the URLs we care about,
        // instead of every URL that we extract.
        BaseUrlFilter urlFilter = new DomainUrlFilter(domain);

        // OK, now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file: bixo.examples.webmining.DemoWebMiningTool.java

License: Apache License

public static void main(String[] args) throws IOException {

    DemoWebMiningOptions options = new DemoWebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.

    try {

        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops 
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath,
                    fetcherPolicy, userAgent, options);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }

    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file: bixo.examples.webmining.WebMiningTool.java

License: Apache License

public static void main(String[] args) throws IOException {

    WebMiningOptions options = new WebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Build and run the flow.

    try {

        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            validMimeTypes.add(String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops 
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);
            Flow flow = WebMiningWorkflow.createWebMiningWorkflow(crawlDbPath, curLoopDirPath, fetcherPolicy,
                    userAgent, options, curLoop == 1);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }

    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file: bixo.fetcher.FetcherTest.java

License: Open Source License

@Test
public void testStaleConnection() throws Exception {
    System.setProperty("bixo.root.level", "TRACE");

    String workingFolder = "build/it/FetcherTest/testStaleConnection/working";
    String inputPath = makeCrawlDb(workingFolder, "src/it/resources/apple-pages.txt");
    Lfs in = new Lfs(new SequenceFile(UrlDatum.FIELDS), inputPath, true);
    String outPath = "build/it/FetcherTest/testStaleConnection/out";
    Lfs content = new Lfs(new SequenceFile(FetchedDatum.FIELDS), outPath + "/content", true);
    Lfs status = new Lfs(new SequenceFile(StatusDatum.FIELDS), outPath + "/status", true);

    Pipe pipe = new Pipe("urlSource");

    UserAgent userAgent = new FirefoxUserAgent();
    FetcherPolicy fetcherPolicy = new FetcherPolicy();
    fetcherPolicy.setMaxRequestsPerConnection(1);
    fetcherPolicy.setCrawlDelay(5 * 1000L);
    BaseFetcher fetcher = new SimpleHttpFetcher(2, fetcherPolicy, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

    FlowConnector flowConnector = new FlowConnector();

    Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
    flow.complete();

    // Test for all valid fetches.
    Lfs validate = new Lfs(new SequenceFile(StatusDatum.FIELDS), outPath + "/status");
    TupleEntryIterator tupleEntryIterator = validate.openForRead(new JobConf());
    while (tupleEntryIterator.hasNext()) {
        TupleEntry entry = tupleEntryIterator.next();
        StatusDatum sd = new StatusDatum(entry);
        if (sd.getStatus() != UrlStatus.FETCHED) {
            LOGGER.error(String.format("Fetched failed! Status is %s for %s", sd.getStatus(), sd.getUrl()));
            BaseFetchException e = sd.getException();
            if (e != null) {
                LOGGER.error("Fetched failed due to exception", e);
            }

            Assert.fail("Status not equal to FETCHED");
        }
    }
}

From source file: bixo.fetcher.FetcherTest.java

License: Open Source License

@Test
public void testRunFetcher() throws Exception {
    System.setProperty("bixo.root.level", "TRACE");

    String workingFolder = "build/test-it/FetcherTest/testRunFetcher";
    String inputPath = makeCrawlDb(workingFolder, "src/it/resources/top10urls.txt");
    Lfs in = new Lfs(new SequenceFile(UrlDatum.FIELDS), inputPath, true);
    Lfs content = new Lfs(new SequenceFile(FetchedDatum.FIELDS), workingFolder + "/content", true);
    Lfs status = new Lfs(new TextLine(), workingFolder + "/status", true);

    Pipe pipe = new Pipe("urlSource");

    UserAgent userAgent = new FirefoxUserAgent();
    BaseFetcher fetcher = new SimpleHttpFetcher(10, userAgent);
    BaseScoreGenerator scorer = new FixedScoreGenerator();
    FetchPipe fetchPipe = new FetchPipe(pipe, scorer, fetcher, 1);

    FlowConnector flowConnector = new FlowConnector();

    Flow flow = flowConnector.connect(in, FetchPipe.makeSinkMap(status, content), fetchPipe);
    flow.complete();

    // Test for 10 good fetches.
    Lfs validate = new Lfs(new SequenceFile(FetchedDatum.FIELDS), workingFolder + "/content");
    TupleEntryIterator tupleEntryIterator = validate.openForRead(new JobConf());
    int fetchedPages = 0;
    while (tupleEntryIterator.hasNext()) {
        TupleEntry entry = tupleEntryIterator.next();
        new FetchedDatum(entry);
        fetchedPages += 1;
    }

    Assert.assertEquals(10, fetchedPages);
}

From source file: cascading.avro.AvroSchemeTest.java

License: Apache License

@Test
public void listOrMapInsideListTest() throws Exception {
    final Schema schema = new Schema.Parser().parse(getClass().getResourceAsStream("test4.avsc"));
    final AvroScheme scheme = new AvroScheme(schema);

    final Fields fields = new Fields("aListOfListOfInt", "aListOfMapToLong");

    final Lfs lfs = new Lfs(scheme, tempDir.getRoot().toString());
    HadoopFlowProcess writeProcess = new HadoopFlowProcess(new JobConf());
    final TupleEntryCollector collector = lfs.openForWrite(writeProcess);

    List<Map<String, Long>> aListOfMapToLong = new ArrayList<Map<String, Long>>();
    Map<String, Long> aMapToLong = new HashMap<String, Long>();
    aMapToLong.put("one", 1L);
    aMapToLong.put("two", 2L);
    aListOfMapToLong.add(aMapToLong);

    List<List<Integer>> aListOfListOfInt = new ArrayList<List<Integer>>();
    List<Integer> aListOfInt = new LinkedList<Integer>();
    aListOfInt.add(0);
    aListOfInt.add(1);
    aListOfListOfInt.add(aListOfInt);

    write(scheme, collector, new TupleEntry(fields, new Tuple(aListOfListOfInt, aListOfMapToLong)));
    collector.close();

    HadoopFlowProcess readProcess = new HadoopFlowProcess(new JobConf());
    final TupleEntryIterator iterator = lfs.openForRead(readProcess);
    assertTrue(iterator.hasNext());
    final TupleEntry readEntry1 = iterator.next();

    List<Integer> outListOfInt = (List) ((List) readEntry1.getObject("aListOfListOfInt")).get(0);
    Map<Utf8, Long> outMapToLong = (Map) ((List) readEntry1.getObject("aListOfMapToLong")).get(0);

    assertEquals(Integer.valueOf(0), outListOfInt.get(0));
    assertEquals(Integer.valueOf(1), outListOfInt.get(1));
    assertEquals(Long.valueOf(1L), outMapToLong.get("one"));
    assertEquals(Long.valueOf(2L), outMapToLong.get("two"));
    assertTrue(!iterator.hasNext());

}