Example usage for org.apache.hadoop.mapred.JobConf JobConf()


Introduction

This page collects example usages of the org.apache.hadoop.mapred.JobConf no-argument constructor, JobConf().

Prototype

public JobConf() 

Document

Construct a map/reduce job configuration.
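
Before the collected examples, here is a minimal sketch of the pattern most of them share: the no-argument constructor builds a JobConf from whatever Hadoop configuration resources are on the classpath, and because JobConf extends Configuration it can also be used to resolve a FileSystem. The class name, job name, property value, and output path in this sketch are placeholders, not taken from any of the examples below.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfSketch {
    public static void main(String[] args) throws IOException {
        // The no-arg constructor picks up the default Hadoop resources
        // (core-site.xml, mapred-site.xml, ...) found on the classpath.
        JobConf conf = new JobConf();

        // Basic job settings, mirroring the examples that follow.
        conf.setJobName("example-job");
        conf.set("mapred.child.java.opts", "-Xmx512m");

        // JobConf is also a Configuration, so it can resolve a FileSystem
        // for a path and clean up a stale output directory.
        Path output = new Path("/tmp/example-output");
        FileSystem fs = output.getFileSystem(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
    }
}

The same construct-then-configure pattern appears throughout the examples below, whether the JobConf is handed to a JobClient, a FileSystem, or a Cascading/Bixo flow.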

Usage

From source file:azkaban.jobtype.connectors.TeradataToHdfsJobRunnerMain.java

License:Apache License

private void runCopyTdToHdfs() throws IOException {
    if (Boolean.valueOf(_jobProps.getProperty("force.output.overwrite", "false").trim())) {
        Path path = new Path(_jobProps.getProperty(TdchConstants.TARGET_HDFS_PATH_KEY));
        _logger.info("Deleting output directory " + path.toUri());
        JobConf conf = new JobConf();
        path.getFileSystem(conf).delete(path, true);
    }

    _logger.info(String.format("Executing %s with params: %s",
            TeradataToHdfsJobRunnerMain.class.getSimpleName(), _params));
    TeradataImportTool.main(_params.toTdchParams());
}

From source file:azkaban.jobtype.javautils.AbstractHadoopJob.java

License:Apache License

@SuppressWarnings("rawtypes")
public JobConf createJobConf(Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass)
        throws IOException, URISyntaxException {
    JobConf conf = new JobConf();
    // set custom class loader with custom find resource strategy.

    conf.setJobName(getJobName());
    conf.setMapperClass(mapperClass);
    if (reducerClass != null) {
        conf.setReducerClass(reducerClass);
    }

    if (props.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");

        logger.info("Running locally, no hadoop jar set.");
    } else {
        HadoopUtils.setClassLoaderAndJar(conf, getClass());
        logger.info("Setting hadoop jar file for class:" + getClass() + "  to " + conf.getJar());
        logger.info("*************************************************************************");
        logger.info(
                "          Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ")           ");
        logger.info("*************************************************************************");
    }

    // set JVM options if present
    if (props.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", props.getString("mapred.child.java.opts"));
        logger.info("mapred.child.java.opts set to " + props.getString("mapred.child.java.opts"));
    }

    // set input and output paths if they are present
    if (props.containsKey("input.paths")) {
        List<String> inputPaths = props.getStringList("input.paths");
        if (inputPaths.size() == 0)
            throw new IllegalArgumentException("Must specify at least one value for property 'input.paths'");
        for (String path : inputPaths) {
            HadoopUtils.addAllSubPaths(conf, new Path(path));
        }
    }

    if (props.containsKey("output.path")) {
        String location = props.get("output.path");
        FileOutputFormat.setOutputPath(conf, new Path(location));

        // For testing purposes only: remove the output directory if it exists
        if (props.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }

    // Adds External jars to hadoop classpath
    String externalJarList = props.getString("hadoop.external.jarFiles", null);
    if (externalJarList != null) {
        FileSystem fs = FileSystem.get(conf);
        String[] jarFiles = externalJarList.split(",");
        for (String jarFile : jarFiles) {
            logger.info("Adding extenral jar File:" + jarFile);
            DistributedCache.addFileToClassPath(new Path(jarFile), conf, fs);
        }
    }

    // Adds distributed cache files
    String cacheFileList = props.getString("hadoop.cache.files", null);
    if (cacheFileList != null) {
        String[] cacheFiles = cacheFileList.split(",");
        for (String cacheFile : cacheFiles) {
            logger.info("Adding Distributed Cache File:" + cacheFile);
            DistributedCache.addCacheFile(new URI(cacheFile), conf);
        }
    }

    // Adds distributed cache archives
    String archiveFileList = props.getString("hadoop.cache.archives", null);
    if (archiveFileList != null) {
        String[] archiveFiles = archiveFileList.split(",");
        for (String archiveFile : archiveFiles) {
            logger.info("Adding Distributed Cache Archive File:" + archiveFile);
            DistributedCache.addCacheArchive(new URI(archiveFile), conf);
        }
    }

    String hadoopCacheJarDir = props.getString("hdfs.default.classpath.dir", null);
    if (hadoopCacheJarDir != null) {
        FileSystem fs = FileSystem.get(conf);
        if (fs != null) {
            FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir));

            if (status != null) {
                for (int i = 0; i < status.length; ++i) {
                    if (!status[i].isDir()) {
                        Path path = new Path(hadoopCacheJarDir, status[i].getPath().getName());
                        logger.info("Adding Jar to Distributed Cache Archive File:" + path);

                        DistributedCache.addFileToClassPath(path, conf, fs);
                    }
                }
            } else {
                logger.info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " is empty.");
            }
        } else {
            logger.info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " filesystem doesn't exist");
        }
    }

    for (String key : getProps().getKeySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, getProps().get(key));
        }
    }

    HadoopUtils.setPropsInJob(conf, getProps());

    // put in tokens
    if (System.getenv(HADOOP_TOKEN_FILE_LOCATION) != null) {
        conf.set(MAPREDUCE_JOB_CREDENTIALS_BINARY, System.getenv(HADOOP_TOKEN_FILE_LOCATION));
    }

    return conf;
}

From source file:azkaban.security.HadoopSecurityManager_H_1_0.java

License:Apache License

@Override
public synchronized void prefetchToken(final File tokenFile, final Props props, final Logger logger)
        throws HadoopSecurityManagerException {

    final String userToProxy = props.getString(USER_TO_PROXY);

    logger.info("Getting hadoop tokens for " + userToProxy);

    final Credentials cred = new Credentials();

    if (props.getBoolean(OBTAIN_HCAT_TOKEN, false)) {
        try {
            logger.info("Pre-fetching Hive MetaStore token from hive");

            HiveConf hiveConf = new HiveConf();
            logger.info("HiveConf.ConfVars.METASTOREURIS.varname "
                    + hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname));
            logger.info("HIVE_METASTORE_SASL_ENABLED " + hiveConf.get(HIVE_METASTORE_SASL_ENABLED));
            logger.info("HIVE_METASTORE_KERBEROS_PRINCIPAL " + hiveConf.get(HIVE_METASTORE_KERBEROS_PRINCIPAL));
            logger.info("HIVE_METASTORE_LOCAL " + hiveConf.get(HIVE_METASTORE_LOCAL));

            HiveMetaStoreClient hiveClient = new HiveMetaStoreClient(hiveConf);
            String hcatTokenStr = hiveClient.getDelegationToken(userToProxy,
                    UserGroupInformation.getLoginUser().getShortUserName());
            Token<DelegationTokenIdentifier> hcatToken = new Token<DelegationTokenIdentifier>();
            hcatToken.decodeFromUrlString(hcatTokenStr);
            logger.info("Created hive metastore token: " + hcatTokenStr);
            logger.info("Token kind: " + hcatToken.getKind());
            logger.info("Token id: " + hcatToken.getIdentifier());
            logger.info("Token service: " + hcatToken.getService());
            cred.addToken(hcatToken.getService(), hcatToken);
        } catch (Exception e) {
            e.printStackTrace();
            logger.error("Failed to get hive metastore token." + e.getMessage() + e.getCause());
        } catch (Throwable t) {
            t.printStackTrace();
            logger.error("Failed to get hive metastore token." + t.getMessage() + t.getCause());
        }
    }

    try {
        getProxiedUser(userToProxy).doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
                getToken(userToProxy);
                return null;
            }

            private void getToken(String userToProxy)
                    throws InterruptedException, IOException, HadoopSecurityManagerException {
                logger.info("Here is the props for " + OBTAIN_NAMENODE_TOKEN + ": "
                        + props.getBoolean(OBTAIN_NAMENODE_TOKEN));
                if (props.getBoolean(OBTAIN_NAMENODE_TOKEN, false)) {
                    FileSystem fs = FileSystem.get(conf);
                    // check if we get the correct FS, and most importantly, the
                    // conf
                    logger.info("Getting DFS token from " + fs.getUri());
                    Token<?> fsToken = fs.getDelegationToken(userToProxy);
                    if (fsToken == null) {
                        logger.error("Failed to fetch DFS token for ");
                        throw new HadoopSecurityManagerException(
                                "Failed to fetch DFS token for " + userToProxy);
                    }
                    logger.info("Created DFS token: " + fsToken.toString());
                    logger.info("Token kind: " + fsToken.getKind());
                    logger.info("Token id: " + fsToken.getIdentifier());
                    logger.info("Token service: " + fsToken.getService());
                    cred.addToken(fsToken.getService(), fsToken);
                }

                if (props.getBoolean(OBTAIN_JOBTRACKER_TOKEN, false)) {
                    JobClient jobClient = new JobClient(new JobConf());
                    logger.info("Pre-fetching JT token from JobTracker");

                    Token<DelegationTokenIdentifier> mrdt = jobClient.getDelegationToken(new Text("mr token"));
                    if (mrdt == null) {
                        logger.error("Failed to fetch JT token");
                        throw new HadoopSecurityManagerException("Failed to fetch JT token for " + userToProxy);
                    }
                    logger.info("Created JT token: " + mrdt.toString());
                    logger.info("Token kind: " + mrdt.getKind());
                    logger.info("Token id: " + mrdt.getIdentifier());
                    logger.info("Token service: " + mrdt.getService());
                    cred.addToken(mrdt.getService(), mrdt);
                }
            }
        });

        FileOutputStream fos = null;
        DataOutputStream dos = null;
        try {
            fos = new FileOutputStream(tokenFile);
            dos = new DataOutputStream(fos);
            cred.writeTokenStorageToStream(dos);
        } finally {
            if (dos != null) {
                dos.close();
            }
            if (fos != null) {
                fos.close();
            }
        }

        // stash them to cancel after use.
        logger.info("Tokens loaded in " + tokenFile.getAbsolutePath());

    } catch (Exception e) {
        e.printStackTrace();
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + e.getMessage() + e.getCause());
    } catch (Throwable t) {
        t.printStackTrace();
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + t.getMessage() + t.getCause());
    }
}

From source file:azkaban.security.HadoopSecurityManager_H_2_0.java

License:Apache License

@Override
public synchronized void prefetchToken(final File tokenFile, final Props props, final Logger logger)
        throws HadoopSecurityManagerException {

    final String userToProxy = props.getString(USER_TO_PROXY);

    logger.info("Getting hadoop tokens based on props for " + userToProxy);

    final Credentials cred = new Credentials();

    if (props.getBoolean(OBTAIN_HCAT_TOKEN, false)) {
        try {

            // first we fetch and save the default hcat token.
            logger.info("Pre-fetching default Hive MetaStore token from hive");

            HiveConf hiveConf = new HiveConf();
            Token<DelegationTokenIdentifier> hcatToken = fetchHcatToken(userToProxy, hiveConf, null, logger);

            cred.addToken(hcatToken.getService(), hcatToken);

            // check and see if user specified the extra hcat locations we need to
            // look at and fetch token.
            final List<String> extraHcatLocations = props.getStringList(EXTRA_HCAT_LOCATION);
            if (Collections.EMPTY_LIST != extraHcatLocations) {
                logger.info("Need to pre-fetch extra metaStore tokens from hive.");

                // start to process the user inputs.
                for (String thriftUrl : extraHcatLocations) {
                    logger.info("Pre-fetching metaStore token from : " + thriftUrl);

                    hiveConf = new HiveConf();
                    hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, thriftUrl);
                    hcatToken = fetchHcatToken(userToProxy, hiveConf, thriftUrl, logger);
                    cred.addToken(hcatToken.getService(), hcatToken);
                }

            }

        } catch (Throwable t) {
            String message = "Failed to get hive metastore token." + t.getMessage() + t.getCause();
            logger.error(message, t);
            throw new HadoopSecurityManagerException(message);
        }
    }

    if (props.getBoolean(OBTAIN_JOBHISTORYSERVER_TOKEN, false)) {
        YarnRPC rpc = YarnRPC.create(conf);
        final String serviceAddr = conf.get(JHAdminConfig.MR_HISTORY_ADDRESS);

        logger.debug("Connecting to HistoryServer at: " + serviceAddr);
        HSClientProtocol hsProxy = (HSClientProtocol) rpc.getProxy(HSClientProtocol.class,
                NetUtils.createSocketAddr(serviceAddr), conf);
        logger.info("Pre-fetching JH token from job history server");

        Token<?> jhsdt = null;
        try {
            jhsdt = getDelegationTokenFromHS(hsProxy);
        } catch (Exception e) {
            logger.error("Failed to fetch JH token", e);
            throw new HadoopSecurityManagerException("Failed to fetch JH token for " + userToProxy);
        }

        if (jhsdt == null) {
            logger.error("getDelegationTokenFromHS() returned null");
            throw new HadoopSecurityManagerException("Unable to fetch JH token for " + userToProxy);
        }

        logger.info("Created JH token: " + jhsdt.toString());
        logger.info("Token kind: " + jhsdt.getKind());
        logger.info("Token id: " + jhsdt.getIdentifier());
        logger.info("Token service: " + jhsdt.getService());

        cred.addToken(jhsdt.getService(), jhsdt);
    }

    try {
        getProxiedUser(userToProxy).doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
                getToken(userToProxy);
                return null;
            }

            private void getToken(String userToProxy)
                    throws InterruptedException, IOException, HadoopSecurityManagerException {
                logger.info("Here is the props for " + OBTAIN_NAMENODE_TOKEN + ": "
                        + props.getBoolean(OBTAIN_NAMENODE_TOKEN));
                if (props.getBoolean(OBTAIN_NAMENODE_TOKEN, false)) {
                    FileSystem fs = FileSystem.get(conf);
                    // check if we get the correct FS, and most importantly, the
                    // conf
                    logger.info("Getting DFS token from " + fs.getUri());
                    Token<?> fsToken = fs
                            .getDelegationToken(getMRTokenRenewerInternal(new JobConf()).toString());
                    if (fsToken == null) {
                        logger.error("Failed to fetch DFS token for ");
                        throw new HadoopSecurityManagerException(
                                "Failed to fetch DFS token for " + userToProxy);
                    }
                    logger.info("Created DFS token: " + fsToken.toString());
                    logger.info("Token kind: " + fsToken.getKind());
                    logger.info("Token id: " + fsToken.getIdentifier());
                    logger.info("Token service: " + fsToken.getService());

                    cred.addToken(fsToken.getService(), fsToken);

                    // getting additional name nodes tokens
                    String otherNamenodes = props.get(OTHER_NAMENODES_TO_GET_TOKEN);
                    if ((otherNamenodes != null) && (otherNamenodes.length() > 0)) {
                        logger.info(OTHER_NAMENODES_TO_GET_TOKEN + ": '" + otherNamenodes + "'");
                        String[] nameNodeArr = otherNamenodes.split(",");
                        Path[] ps = new Path[nameNodeArr.length];
                        for (int i = 0; i < ps.length; i++) {
                            ps[i] = new Path(nameNodeArr[i].trim());
                        }
                        TokenCache.obtainTokensForNamenodes(cred, ps, conf);
                        logger.info("Successfully fetched tokens for: " + otherNamenodes);
                    } else {
                        logger.info(OTHER_NAMENODES_TO_GET_TOKEN + " was not configured");
                    }
                }

                if (props.getBoolean(OBTAIN_JOBTRACKER_TOKEN, false)) {
                    JobConf jobConf = new JobConf();
                    JobClient jobClient = new JobClient(jobConf);
                    logger.info("Pre-fetching JT token from JobTracker");

                    Token<DelegationTokenIdentifier> mrdt = jobClient
                            .getDelegationToken(getMRTokenRenewerInternal(jobConf));
                    if (mrdt == null) {
                        logger.error("Failed to fetch JT token");
                        throw new HadoopSecurityManagerException("Failed to fetch JT token for " + userToProxy);
                    }
                    logger.info("Created JT token: " + mrdt.toString());
                    logger.info("Token kind: " + mrdt.getKind());
                    logger.info("Token id: " + mrdt.getIdentifier());
                    logger.info("Token service: " + mrdt.getService());
                    cred.addToken(mrdt.getService(), mrdt);
                }

            }
        });

        FileOutputStream fos = null;
        DataOutputStream dos = null;
        try {
            fos = new FileOutputStream(tokenFile);
            dos = new DataOutputStream(fos);
            cred.writeTokenStorageToStream(dos);
        } finally {
            if (dos != null) {
                try {
                    dos.close();
                } catch (Throwable t) {
                    // best effort
                    logger.error("encountered exception while closing DataOutputStream of the tokenFile", t);
                }
            }
            if (fos != null) {
                fos.close();
            }
        }
        // stash them to cancel after use.

        logger.info("Tokens loaded in " + tokenFile.getAbsolutePath());

    } catch (Exception e) {
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + e.getMessage() + e.getCause(), e);
    } catch (Throwable t) {
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + t.getMessage() + t.getCause(), t);
    }

}

From source file:bixo.config.BixoPlatform.java

License:Apache License

public BixoPlatform(Class applicationJarClass, Platform platform) throws Exception {
    super(applicationJarClass);
    if (platform == Platform.Local) {
        _platform = new LocalPlatform(applicationJarClass);
        setJobPollingInterval(CASCADING_LOCAL_JOB_POLLING_INTERVAL);
    } else {
        configureHadoopPlatform(applicationJarClass, new JobConf());
    }
}

From source file:bixo.examples.crawl.CreateUrlDatumFromOutlinksFunctionTest.java

License:Apache License

@SuppressWarnings("unchecked")
@Test
public void testOperate() {
    CreateUrlDatumFromOutlinksFunction op = new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(),
            new SimpleUrlValidator());
    HadoopFlowProcess fp = Mockito.mock(HadoopFlowProcess.class);
    Mockito.when(fp.getJobConf()).thenReturn(new JobConf());

    OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
    FunctionCall<NullContext> fc = Mockito.mock(FunctionCall.class);
    TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class);

    Outlink outlink1 = new Outlink("http://bar.com/", "anchorText");
    Outlink outlinks[] = { outlink1 };
    ParsedDatum datum = new ParsedDatum("http://foo.com/", "foo.com", "parsed text", "en", "title", outlinks,
            null);
    datum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, 0);

    TupleEntry entry = new TupleEntry(ParsedDatum.FIELDS);
    entry.setTuple(new Tuple(datum.getTuple()));

    Mockito.when(fc.getArguments()).thenReturn(entry);
    Mockito.when(fc.getOutputCollector()).thenReturn(collector);

    op.prepare(fp, oc);
    op.operate(fp, fc);
    op.cleanup(fp, oc);

    Mockito.verify(collector).add(Mockito.argThat(new MatchUrlDatum()));
    Mockito.verifyNoMoreInteractions(collector);

}

From source file:bixo.examples.crawl.DemoCrawlTool.java

License:Apache License

public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user want to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output
        // directory and create an initial crawldb subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        // this is to cause Bixo to block waiting for next time it can fetch from a particular site.
        // todo: may not be necessary in future versions of Bixo
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls as you may 
        // end up in situations where the fetch slows down due to a 'long tail' and by 
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.DemoCrawlWorkflowLRTest.java

License:Apache License

@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    DemoCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {

        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    options.setUseBoilerpipe(true);
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the
        // /urls dir has 11 entries with
        // one being previously crawled, and the other 10 being pending.

        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;

                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();
        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            // datum.getUrl(), datum.getLastStatus(),
            // datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);

        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }

}

From source file:bixo.examples.crawl.DemoStatusTool.java

License:Apache License

public static void main(String[] args) {
    DemoStatusToolOptions options = new DemoStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getWorkingDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }

                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);

            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.JDBCCrawlTool.java

License:Apache License

public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }
    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                System.err.println("For a new crawl the domain needs to be specified" + domain);
                printUsageAndExit(parser);
            }
            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    JDBCTapFactory.shutdown();
}