Example usage for org.apache.hadoop.mapred.JobConf JobConf()


Introduction

This page collects example usages of the org.apache.hadoop.mapred.JobConf no-argument constructor, JobConf().

Prototype

public JobConf() 

Document

Construct a map/reduce job configuration.
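
Before the collected examples, here is a minimal sketch of the pattern most of them share: the no-argument constructor builds a JobConf from whatever Hadoop configuration resources are on the classpath, and because JobConf extends Configuration it can also be used to resolve a FileSystem. The class name, job name, property value, and output path in this sketch are placeholders, not taken from any of the examples below.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfSketch {
    public static void main(String[] args) throws IOException {
        // The no-arg constructor picks up the default Hadoop resources
        // (core-site.xml, mapred-site.xml, ...) found on the classpath.
        JobConf conf = new JobConf();

        // Basic job settings, mirroring the examples that follow.
        conf.setJobName("example-job");
        conf.set("mapred.child.java.opts", "-Xmx512m");

        // JobConf is also a Configuration, so it can resolve a FileSystem
        // for a path and clean up a stale output directory.
        Path output = new Path("/tmp/example-output");
        FileSystem fs = output.getFileSystem(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
    }
}

The same construct-then-configure pattern appears throughout the examples below, whether the JobConf is handed to a JobClient, a FileSystem, or a Cascading/Bixo flow.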

Usage

From source file:azkaban.jobtype.connectors.TeradataToHdfsJobRunnerMain.java

License:Apache License

private void runCopyTdToHdfs() throws IOException {
    if (Boolean.valueOf(_jobProps.getProperty("force.output.overwrite", "false").trim())) {
        Path path = new Path(_jobProps.getProperty(TdchConstants.TARGET_HDFS_PATH_KEY));
        _logger.info("Deleting output directory " + path.toUri());
        JobConf conf = new JobConf();
        path.getFileSystem(conf).delete(path, true);
    }

    _logger.info(String.format("Executing %s with params: %s",
            TeradataToHdfsJobRunnerMain.class.getSimpleName(), _params));
    TeradataImportTool.main(_params.toTdchParams());
}

From source file:azkaban.jobtype.javautils.AbstractHadoopJob.java

License:Apache License

@SuppressWarnings("rawtypes")
public JobConf createJobConf(Class<? extends Mapper> mapperClass, Class<? extends Reducer> reducerClass)
        throws IOException, URISyntaxException {
    JobConf conf = new JobConf();
    // set custom class loader with custom find resource strategy.

    conf.setJobName(getJobName());
    conf.setMapperClass(mapperClass);
    if (reducerClass != null) {
        conf.setReducerClass(reducerClass);
    }

    if (props.getBoolean("is.local", false)) {
        conf.set("mapred.job.tracker", "local");
        conf.set("fs.default.name", "file:///");
        conf.set("mapred.local.dir", "/tmp/map-red");

        logger.info("Running locally, no hadoop jar set.");
    } else {
        HadoopUtils.setClassLoaderAndJar(conf, getClass());
        logger.info("Setting hadoop jar file for class:" + getClass() + "  to " + conf.getJar());
        logger.info("*************************************************************************");
        logger.info(
                "          Running on Real Hadoop Cluster(" + conf.get("mapred.job.tracker") + ")           ");
        logger.info("*************************************************************************");
    }

    // set JVM options if present
    if (props.containsKey("mapred.child.java.opts")) {
        conf.set("mapred.child.java.opts", props.getString("mapred.child.java.opts"));
        logger.info("mapred.child.java.opts set to " + props.getString("mapred.child.java.opts"));
    }

    // set input and output paths if they are present
    if (props.containsKey("input.paths")) {
        List<String> inputPaths = props.getStringList("input.paths");
        if (inputPaths.size() == 0)
            throw new IllegalArgumentException("Must specify at least one value for property 'input.paths'");
        for (String path : inputPaths) {
            HadoopUtils.addAllSubPaths(conf, new Path(path));
        }
    }

    if (props.containsKey("output.path")) {
        String location = props.get("output.path");
        FileOutputFormat.setOutputPath(conf, new Path(location));

        // For testing purposes only: remove the output directory if it exists
        if (props.getBoolean("force.output.overwrite", false)) {
            FileSystem fs = FileOutputFormat.getOutputPath(conf).getFileSystem(conf);
            fs.delete(FileOutputFormat.getOutputPath(conf), true);
        }
    }

    // Adds External jars to hadoop classpath
    String externalJarList = props.getString("hadoop.external.jarFiles", null);
    if (externalJarList != null) {
        FileSystem fs = FileSystem.get(conf);
        String[] jarFiles = externalJarList.split(",");
        for (String jarFile : jarFiles) {
            logger.info("Adding extenral jar File:" + jarFile);
            DistributedCache.addFileToClassPath(new Path(jarFile), conf, fs);
        }
    }

    // Adds distributed cache files
    String cacheFileList = props.getString("hadoop.cache.files", null);
    if (cacheFileList != null) {
        String[] cacheFiles = cacheFileList.split(",");
        for (String cacheFile : cacheFiles) {
            logger.info("Adding Distributed Cache File:" + cacheFile);
            DistributedCache.addCacheFile(new URI(cacheFile), conf);
        }
    }

    // Adds distributed cache archives
    String archiveFileList = props.getString("hadoop.cache.archives", null);
    if (archiveFileList != null) {
        String[] archiveFiles = archiveFileList.split(",");
        for (String archiveFile : archiveFiles) {
            logger.info("Adding Distributed Cache Archive File:" + archiveFile);
            DistributedCache.addCacheArchive(new URI(archiveFile), conf);
        }
    }

    String hadoopCacheJarDir = props.getString("hdfs.default.classpath.dir", null);
    if (hadoopCacheJarDir != null) {
        FileSystem fs = FileSystem.get(conf);
        if (fs != null) {
            FileStatus[] status = fs.listStatus(new Path(hadoopCacheJarDir));

            if (status != null) {
                for (int i = 0; i < status.length; ++i) {
                    if (!status[i].isDir()) {
                        Path path = new Path(hadoopCacheJarDir, status[i].getPath().getName());
                        logger.info("Adding Jar to Distributed Cache Archive File:" + path);

                        DistributedCache.addFileToClassPath(path, conf, fs);
                    }
                }
            } else {
                logger.info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " is empty.");
            }
        } else {
            logger.info("hdfs.default.classpath.dir " + hadoopCacheJarDir + " filesystem doesn't exist");
        }
    }

    for (String key : getProps().getKeySet()) {
        String lowerCase = key.toLowerCase();
        if (lowerCase.startsWith(HADOOP_PREFIX)) {
            String newKey = key.substring(HADOOP_PREFIX.length());
            conf.set(newKey, getProps().get(key));
        }
    }

    HadoopUtils.setPropsInJob(conf, getProps());

    // put in tokens
    if (System.getenv(HADOOP_TOKEN_FILE_LOCATION) != null) {
        conf.set(MAPREDUCE_JOB_CREDENTIALS_BINARY, System.getenv(HADOOP_TOKEN_FILE_LOCATION));
    }

    return conf;
}

From source file:azkaban.security.HadoopSecurityManager_H_1_0.java

License:Apache License

@Override
public synchronized void prefetchToken(final File tokenFile, final Props props, final Logger logger)
        throws HadoopSecurityManagerException {

    final String userToProxy = props.getString(USER_TO_PROXY);

    logger.info("Getting hadoop tokens for " + userToProxy);

    final Credentials cred = new Credentials();

    if (props.getBoolean(OBTAIN_HCAT_TOKEN, false)) {
        try {
            logger.info("Pre-fetching Hive MetaStore token from hive");

            HiveConf hiveConf = new HiveConf();
            logger.info("HiveConf.ConfVars.METASTOREURIS.varname "
                    + hiveConf.get(HiveConf.ConfVars.METASTOREURIS.varname));
            logger.info("HIVE_METASTORE_SASL_ENABLED " + hiveConf.get(HIVE_METASTORE_SASL_ENABLED));
            logger.info("HIVE_METASTORE_KERBEROS_PRINCIPAL " + hiveConf.get(HIVE_METASTORE_KERBEROS_PRINCIPAL));
            logger.info("HIVE_METASTORE_LOCAL " + hiveConf.get(HIVE_METASTORE_LOCAL));

            HiveMetaStoreClient hiveClient = new HiveMetaStoreClient(hiveConf);
            String hcatTokenStr = hiveClient.getDelegationToken(userToProxy,
                    UserGroupInformation.getLoginUser().getShortUserName());
            Token<DelegationTokenIdentifier> hcatToken = new Token<DelegationTokenIdentifier>();
            hcatToken.decodeFromUrlString(hcatTokenStr);
            logger.info("Created hive metastore token: " + hcatTokenStr);
            logger.info("Token kind: " + hcatToken.getKind());
            logger.info("Token id: " + hcatToken.getIdentifier());
            logger.info("Token service: " + hcatToken.getService());
            cred.addToken(hcatToken.getService(), hcatToken);
        } catch (Exception e) {
            e.printStackTrace();
            logger.error("Failed to get hive metastore token." + e.getMessage() + e.getCause());
        } catch (Throwable t) {
            t.printStackTrace();
            logger.error("Failed to get hive metastore token." + t.getMessage() + t.getCause());
        }
    }

    try {
        getProxiedUser(userToProxy).doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
                getToken(userToProxy);
                return null;
            }

            private void getToken(String userToProxy)
                    throws InterruptedException, IOException, HadoopSecurityManagerException {
                logger.info("Here is the props for " + OBTAIN_NAMENODE_TOKEN + ": "
                        + props.getBoolean(OBTAIN_NAMENODE_TOKEN));
                if (props.getBoolean(OBTAIN_NAMENODE_TOKEN, false)) {
                    FileSystem fs = FileSystem.get(conf);
                    // check if we get the correct FS, and most importantly, the
                    // conf
                    logger.info("Getting DFS token from " + fs.getUri());
                    Token<?> fsToken = fs.getDelegationToken(userToProxy);
                    if (fsToken == null) {
                        logger.error("Failed to fetch DFS token for ");
                        throw new HadoopSecurityManagerException(
                                "Failed to fetch DFS token for " + userToProxy);
                    }
                    logger.info("Created DFS token: " + fsToken.toString());
                    logger.info("Token kind: " + fsToken.getKind());
                    logger.info("Token id: " + fsToken.getIdentifier());
                    logger.info("Token service: " + fsToken.getService());
                    cred.addToken(fsToken.getService(), fsToken);
                }

                if (props.getBoolean(OBTAIN_JOBTRACKER_TOKEN, false)) {
                    JobClient jobClient = new JobClient(new JobConf());
                    logger.info("Pre-fetching JT token from JobTracker");

                    Token<DelegationTokenIdentifier> mrdt = jobClient.getDelegationToken(new Text("mr token"));
                    if (mrdt == null) {
                        logger.error("Failed to fetch JT token");
                        throw new HadoopSecurityManagerException("Failed to fetch JT token for " + userToProxy);
                    }
                    logger.info("Created JT token: " + mrdt.toString());
                    logger.info("Token kind: " + mrdt.getKind());
                    logger.info("Token id: " + mrdt.getIdentifier());
                    logger.info("Token service: " + mrdt.getService());
                    cred.addToken(mrdt.getService(), mrdt);
                }
            }
        });

        FileOutputStream fos = null;
        DataOutputStream dos = null;
        try {
            fos = new FileOutputStream(tokenFile);
            dos = new DataOutputStream(fos);
            cred.writeTokenStorageToStream(dos);
        } finally {
            if (dos != null) {
                dos.close();
            }
            if (fos != null) {
                fos.close();
            }
        }

        // stash them to cancel after use.
        logger.info("Tokens loaded in " + tokenFile.getAbsolutePath());

    } catch (Exception e) {
        e.printStackTrace();
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + e.getMessage() + e.getCause());
    } catch (Throwable t) {
        t.printStackTrace();
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + t.getMessage() + t.getCause());
    }
}

From source file:azkaban.security.HadoopSecurityManager_H_2_0.java

License:Apache License

@Override
public synchronized void prefetchToken(final File tokenFile, final Props props, final Logger logger)
        throws HadoopSecurityManagerException {

    final String userToProxy = props.getString(USER_TO_PROXY);

    logger.info("Getting hadoop tokens based on props for " + userToProxy);

    final Credentials cred = new Credentials();

    if (props.getBoolean(OBTAIN_HCAT_TOKEN, false)) {
        try {

            // first we fetch and save the default hcat token.
            logger.info("Pre-fetching default Hive MetaStore token from hive");

            HiveConf hiveConf = new HiveConf();
            Token<DelegationTokenIdentifier> hcatToken = fetchHcatToken(userToProxy, hiveConf, null, logger);

            cred.addToken(hcatToken.getService(), hcatToken);

            // check and see if user specified the extra hcat locations we need to
            // look at and fetch token.
            final List<String> extraHcatLocations = props.getStringList(EXTRA_HCAT_LOCATION);
            if (Collections.EMPTY_LIST != extraHcatLocations) {
                logger.info("Need to pre-fetch extra metaStore tokens from hive.");

                // start to process the user inputs.
                for (String thriftUrl : extraHcatLocations) {
                    logger.info("Pre-fetching metaStore token from : " + thriftUrl);

                    hiveConf = new HiveConf();
                    hiveConf.set(HiveConf.ConfVars.METASTOREURIS.varname, thriftUrl);
                    hcatToken = fetchHcatToken(userToProxy, hiveConf, thriftUrl, logger);
                    cred.addToken(hcatToken.getService(), hcatToken);
                }

            }

        } catch (Throwable t) {
            String message = "Failed to get hive metastore token." + t.getMessage() + t.getCause();
            logger.error(message, t);
            throw new HadoopSecurityManagerException(message);
        }
    }

    if (props.getBoolean(OBTAIN_JOBHISTORYSERVER_TOKEN, false)) {
        YarnRPC rpc = YarnRPC.create(conf);
        final String serviceAddr = conf.get(JHAdminConfig.MR_HISTORY_ADDRESS);

        logger.debug("Connecting to HistoryServer at: " + serviceAddr);
        HSClientProtocol hsProxy = (HSClientProtocol) rpc.getProxy(HSClientProtocol.class,
                NetUtils.createSocketAddr(serviceAddr), conf);
        logger.info("Pre-fetching JH token from job history server");

        Token<?> jhsdt = null;
        try {
            jhsdt = getDelegationTokenFromHS(hsProxy);
        } catch (Exception e) {
            logger.error("Failed to fetch JH token", e);
            throw new HadoopSecurityManagerException("Failed to fetch JH token for " + userToProxy);
        }

        if (jhsdt == null) {
            logger.error("getDelegationTokenFromHS() returned null");
            throw new HadoopSecurityManagerException("Unable to fetch JH token for " + userToProxy);
        }

        logger.info("Created JH token: " + jhsdt.toString());
        logger.info("Token kind: " + jhsdt.getKind());
        logger.info("Token id: " + jhsdt.getIdentifier());
        logger.info("Token service: " + jhsdt.getService());

        cred.addToken(jhsdt.getService(), jhsdt);
    }

    try {
        getProxiedUser(userToProxy).doAs(new PrivilegedExceptionAction<Void>() {
            @Override
            public Void run() throws Exception {
                getToken(userToProxy);
                return null;
            }

            private void getToken(String userToProxy)
                    throws InterruptedException, IOException, HadoopSecurityManagerException {
                logger.info("Here is the props for " + OBTAIN_NAMENODE_TOKEN + ": "
                        + props.getBoolean(OBTAIN_NAMENODE_TOKEN));
                if (props.getBoolean(OBTAIN_NAMENODE_TOKEN, false)) {
                    FileSystem fs = FileSystem.get(conf);
                    // check if we get the correct FS, and most importantly, the
                    // conf
                    logger.info("Getting DFS token from " + fs.getUri());
                    Token<?> fsToken = fs
                            .getDelegationToken(getMRTokenRenewerInternal(new JobConf()).toString());
                    if (fsToken == null) {
                        logger.error("Failed to fetch DFS token for ");
                        throw new HadoopSecurityManagerException(
                                "Failed to fetch DFS token for " + userToProxy);
                    }
                    logger.info("Created DFS token: " + fsToken.toString());
                    logger.info("Token kind: " + fsToken.getKind());
                    logger.info("Token id: " + fsToken.getIdentifier());
                    logger.info("Token service: " + fsToken.getService());

                    cred.addToken(fsToken.getService(), fsToken);

                    // getting additional name nodes tokens
                    String otherNamenodes = props.get(OTHER_NAMENODES_TO_GET_TOKEN);
                    if ((otherNamenodes != null) && (otherNamenodes.length() > 0)) {
                        logger.info(OTHER_NAMENODES_TO_GET_TOKEN + ": '" + otherNamenodes + "'");
                        String[] nameNodeArr = otherNamenodes.split(",");
                        Path[] ps = new Path[nameNodeArr.length];
                        for (int i = 0; i < ps.length; i++) {
                            ps[i] = new Path(nameNodeArr[i].trim());
                        }
                        TokenCache.obtainTokensForNamenodes(cred, ps, conf);
                        logger.info("Successfully fetched tokens for: " + otherNamenodes);
                    } else {
                        logger.info(OTHER_NAMENODES_TO_GET_TOKEN + " was not configured");
                    }
                }

                if (props.getBoolean(OBTAIN_JOBTRACKER_TOKEN, false)) {
                    JobConf jobConf = new JobConf();
                    JobClient jobClient = new JobClient(jobConf);
                    logger.info("Pre-fetching JT token from JobTracker");

                    Token<DelegationTokenIdentifier> mrdt = jobClient
                            .getDelegationToken(getMRTokenRenewerInternal(jobConf));
                    if (mrdt == null) {
                        logger.error("Failed to fetch JT token");
                        throw new HadoopSecurityManagerException("Failed to fetch JT token for " + userToProxy);
                    }
                    logger.info("Created JT token: " + mrdt.toString());
                    logger.info("Token kind: " + mrdt.getKind());
                    logger.info("Token id: " + mrdt.getIdentifier());
                    logger.info("Token service: " + mrdt.getService());
                    cred.addToken(mrdt.getService(), mrdt);
                }

            }
        });

        FileOutputStream fos = null;
        DataOutputStream dos = null;
        try {
            fos = new FileOutputStream(tokenFile);
            dos = new DataOutputStream(fos);
            cred.writeTokenStorageToStream(dos);
        } finally {
            if (dos != null) {
                try {
                    dos.close();
                } catch (Throwable t) {
                    // best effort
                    logger.error("encountered exception while closing DataOutputStream of the tokenFile", t);
                }
            }
            if (fos != null) {
                fos.close();
            }
        }
        // stash them to cancel after use.

        logger.info("Tokens loaded in " + tokenFile.getAbsolutePath());

    } catch (Exception e) {
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + e.getMessage() + e.getCause(), e);
    } catch (Throwable t) {
        throw new HadoopSecurityManagerException(
                "Failed to get hadoop tokens! " + t.getMessage() + t.getCause(), t);
    }

}

From source file:bixo.config.BixoPlatform.java

License:Apache License

public BixoPlatform(Class applicationJarClass, Platform platform) throws Exception {
    super(applicationJarClass);
    if (platform == Platform.Local) {
        _platform = new LocalPlatform(applicationJarClass);
        setJobPollingInterval(CASCADING_LOCAL_JOB_POLLING_INTERVAL);
    } else {
        configureHadoopPlatform(applicationJarClass, new JobConf());
    }
}

From source file:bixo.examples.crawl.CreateUrlDatumFromOutlinksFunctionTest.java

License:Apache License

@SuppressWarnings("unchecked")
@Test
public void testOperate() {
    CreateUrlDatumFromOutlinksFunction op = new CreateUrlDatumFromOutlinksFunction(new SimpleUrlNormalizer(),
            new SimpleUrlValidator());
    HadoopFlowProcess fp = Mockito.mock(HadoopFlowProcess.class);
    Mockito.when(fp.getJobConf()).thenReturn(new JobConf());

    OperationCall<NullContext> oc = Mockito.mock(OperationCall.class);
    FunctionCall<NullContext> fc = Mockito.mock(FunctionCall.class);
    TupleEntryCollector collector = Mockito.mock(TupleEntryCollector.class);

    Outlink outlink1 = new Outlink("http://bar.com/", "anchorText");
    Outlink outlinks[] = { outlink1 };
    ParsedDatum datum = new ParsedDatum("http://foo.com/", "foo.com", "parsed text", "en", "title", outlinks,
            null);
    datum.setPayloadValue(CrawlDbDatum.CRAWL_DEPTH, 0);

    TupleEntry entry = new TupleEntry(ParsedDatum.FIELDS);
    entry.setTuple(new Tuple(datum.getTuple()));

    Mockito.when(fc.getArguments()).thenReturn(entry);
    Mockito.when(fc.getOutputCollector()).thenReturn(collector);

    op.prepare(fp, oc);
    op.operate(fp, fc);
    op.cleanup(fp, oc);

    Mockito.verify(collector).add(Mockito.argThat(new MatchUrlDatum()));
    Mockito.verifyNoMoreInteractions(collector);

}

From source file:bixo.examples.crawl.DemoCrawlTool.java

License:Apache License

public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user want to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output
        // directory and create an initial crawldb subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        // this is to cause Bixo to block waiting for next time it can fetch from a particular site.
        // todo: may not be necessary in future versions of Bixo
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls as you may 
        // end up in situations where the fetch slows down due to a 'long tail' and by 
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.DemoCrawlWorkflowLRTest.java

License:Apache License

@Test
public void testNotLosingFetchedUrls() throws Throwable {
    String baseDirName = "build/test/SimpleCrawlWorkflowLRTest/output";
    JobConf conf = new JobConf();
    Path baseDirPath = new Path(baseDirName);
    FileSystem fs = baseDirPath.getFileSystem(conf);

    HadoopUtils.safeRemove(fs, baseDirPath);
    Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 0);
    Path crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

    DemoCrawlTool.importOneDomain("localhost:8089", crawlDbPath, conf);
    curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 1);

    FetcherPolicy defaultPolicy = new FetcherPolicy();
    defaultPolicy.setCrawlDelay(1);
    defaultPolicy.setFetcherMode(FetcherMode.COMPLETE);
    BaseUrlFilter urlFilter = new BaseUrlFilter() {

        @Override
        public boolean isRemove(UrlDatum datum) {
            return false;
        }
    };

    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    options.setUseBoilerpipe(true);
    UserAgent userAgent = new UserAgent("test", "test@domain.com", "http://test.domain.com");
    Server server = null;
    try {
        server = startServer(new FakeWebSiteHandler(), 8089);
        Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                urlFilter, options);
        flow.complete();

        // Update the crawlDb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Now we should have an output/1-<timestamp>/ directory, where the
        // /urls dir has 11 entries with
        // one being previously crawled, and the other 10 being pending.

        Hfs crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        TupleEntryIterator iter = crawldbTap.openForRead(conf);

        int numFetched = 0;
        int numPending = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int crawlDepth = datum.getCrawlDepth();
            if (datum.getLastFetched() != 0) {
                numFetched += 1;

                assertEquals(UrlStatus.FETCHED, status);
                assertEquals(0, crawlDepth);
            } else {
                numPending += 1;
                assertEquals(UrlStatus.UNFETCHED, status);
                assertEquals(1, crawlDepth);
            }
        }

        assertEquals(1, numFetched);
        assertEquals(10, numPending);

        // Do it one more time, to verify status gets propagated forward.
        curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, baseDirPath, 2);

        flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter,
                options);
        flow.complete();
        // Update crawldb path
        crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
        iter = crawldbTap.openForRead(conf);

        numFetched = 0;
        numPending = 0;
        int numDepth0 = 0;
        int numDepth1 = 0;
        int numDepth2 = 0;
        while (iter.hasNext()) {
            CrawlDbDatum datum = new CrawlDbDatum(iter.next());
            UrlStatus status = datum.getLastStatus();
            int depth = datum.getCrawlDepth();

            if (datum.getLastFetched() != 0) {
                numFetched += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.FETCHED, status);
            } else {
                numPending += 1;
                assertEquals("URL has incorrect status: " + datum.getUrl(), UrlStatus.UNFETCHED, status);
            }

            if (depth == 0) {
                numDepth0 += 1;
            } else if (depth == 1) {
                numDepth1 += 1;
            } else if (depth == 2) {
                numDepth2 += 1;
            } else {
                fail("Invalid crawl depth for " + datum.getUrl());
            }

            // System.out.println(String.format("URL %s has status %s, last fetch %d, and depth %d",
            // datum.getUrl(), datum.getLastStatus(),
            // datum.getLastFetched(), depth));
        }

        assertEquals(11, numFetched);
        assertEquals(100, numPending);

        assertEquals(1, numDepth0);
        assertEquals(10, numDepth1);
        assertEquals(100, numDepth2);
    } catch (Throwable t) {
        fail(t.getMessage());
    } finally {
        if (server != null) {
            server.stop();
        }
    }

}

From source file:bixo.examples.crawl.DemoStatusTool.java

License:Apache License

public static void main(String[] args) {
    DemoStatusToolOptions options = new DemoStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getWorkingDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }

                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);

            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.JDBCCrawlTool.java

License:Apache License

public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }
    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                System.err.println("For a new crawl the domain needs to be specified" + domain);
                printUsageAndExit(parser);
            }
            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    JDBCTapFactory.shutdown();
}