Example usage for org.apache.hadoop.fs FileSystem get

List of usage examples for org.apache.hadoop.fs FileSystem get

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.get(Configuration).

Prototype

public static FileSystem get(Configuration conf) throws IOException 

Document

Returns the configured FileSystem implementation.
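
For orientation, here is a minimal, self-contained sketch of calling FileSystem.get(Configuration). It is not taken from the usage examples below; the class name FileSystemGetExample and the path /tmp/example.txt are placeholders for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileSystemGetExample {
    public static void main(String[] args) throws IOException {
        // Picks up fs.defaultFS (and other settings) from core-site.xml when present,
        // otherwise defaults to the local file system.
        Configuration conf = new Configuration();

        // Returns the configured FileSystem implementation for the default URI.
        FileSystem fs = FileSystem.get(conf);

        Path path = new Path("/tmp/example.txt"); // placeholder path
        if (fs.exists(path))
            System.out.println("length = " + fs.getFileStatus(path).getLen());
    }
}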

Usage

From source file: cascading.platform.hadoop2.Hadoop2MR1Platform.java

License: Open Source License

@Override
public synchronized void setUp() throws IOException {
    if (configuration != null)
        return;

    if (!isUseCluster()) {
        LOG.info("not using cluster");
        configuration = new JobConf();

        // enforce settings to make local mode behave the same across distributions
        configuration.set("fs.defaultFS", "file:///");
        configuration.set("mapreduce.framework.name", "local");
        configuration.set("mapreduce.jobtracker.staging.root.dir",
                System.getProperty("user.dir") + "/" + "build/tmp/cascading/staging");

        String stagingDir = configuration.get("mapreduce.jobtracker.staging.root.dir");

        if (Util.isEmpty(stagingDir))
            configuration.set("mapreduce.jobtracker.staging.root.dir",
                    System.getProperty("user.dir") + "/build/tmp/cascading/staging");

        fileSys = FileSystem.get(configuration);
    } else {
        LOG.info("using cluster");

        if (Util.isEmpty(System.getProperty("hadoop.log.dir")))
            System.setProperty("hadoop.log.dir", "build/test/log");

        if (Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            System.setProperty("hadoop.tmp.dir", "build/test/tmp");

        new File(System.getProperty("hadoop.log.dir")).mkdirs(); // ignored

        JobConf conf = new JobConf();

        if (!Util.isEmpty(System.getProperty("mapred.jar"))) {
            LOG.info("using a remote cluster with jar: {}", System.getProperty("mapred.jar"));
            configuration = conf;

            ((JobConf) configuration).setJar(System.getProperty("mapred.jar"));

            if (!Util.isEmpty(System.getProperty("fs.default.name"))) {
                LOG.info("using {}={}", "fs.default.name", System.getProperty("fs.default.name"));
                configuration.set("fs.default.name", System.getProperty("fs.default.name"));
            }

            if (!Util.isEmpty(System.getProperty("mapred.job.tracker"))) {
                LOG.info("using {}={}", "mapred.job.tracker", System.getProperty("mapred.job.tracker"));
                configuration.set("mapred.job.tracker", System.getProperty("mapred.job.tracker"));
            }

            if (!Util.isEmpty(System.getProperty("fs.defaultFS"))) {
                LOG.info("using {}={}", "fs.defaultFS", System.getProperty("fs.defaultFS"));
                configuration.set("fs.defaultFS", System.getProperty("fs.defaultFS"));
            }

            if (!Util.isEmpty(System.getProperty("yarn.resourcemanager.address"))) {
                LOG.info("using {}={}", "yarn.resourcemanager.address",
                        System.getProperty("yarn.resourcemanager.address"));
                configuration.set("yarn.resourcemanager.address",
                        System.getProperty("yarn.resourcemanager.address"));
            }

            if (!Util.isEmpty(System.getProperty("mapreduce.jobhistory.address"))) {
                LOG.info("using {}={}", "mapreduce.jobhistory.address",
                        System.getProperty("mapreduce.jobhistory.address"));
                configuration.set("mapreduce.jobhistory.address",
                        System.getProperty("mapreduce.jobhistory.address"));
            }

            configuration.set("mapreduce.user.classpath.first", "true"); // use test dependencies
            configuration.set("mapreduce.framework.name", "yarn");

            fileSys = FileSystem.get(configuration);
        } else {
            conf.setBoolean("yarn.is.minicluster", true);
            //      conf.setInt( "yarn.nodemanager.delete.debug-delay-sec", -1 );
            //      conf.set( "yarn.scheduler.capacity.root.queues", "default" );
            //      conf.set( "yarn.scheduler.capacity.root.default.capacity", "100" );
            // disable host blacklisting so that localhost is not blacklisted during unit tests
            conf.setBoolean("yarn.app.mapreduce.am.job.node-blacklisting.enable", false);

            dfs = new MiniDFSCluster(conf, 4, true, null);
            fileSys = dfs.getFileSystem();

            FileSystem.setDefaultUri(conf, fileSys.getUri());

            mr = MiniMRClientClusterFactory.create(this.getClass(), 4, conf);

            configuration = mr.getConfig();
        }

        configuration.set("mapred.child.java.opts", "-Xmx512m");
        configuration.setInt("mapreduce.job.jvm.numtasks", -1);
        configuration.setInt("mapreduce.client.completion.pollinterval", 50);
        configuration.setInt("mapreduce.client.progressmonitor.pollinterval", 50);
        configuration.setBoolean("mapreduce.map.speculative", false);
        configuration.setBoolean("mapreduce.reduce.speculative", false);
    }

    configuration.setInt("mapreduce.job.maps", numMappers);
    configuration.setInt("mapreduce.job.reduces", numReducers);

    Map<Object, Object> globalProperties = getGlobalProperties();

    if (logger != null)
        globalProperties.put("log4j.logger", logger);

    FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests

    Hadoop2MR1Planner.copyProperties(configuration, globalProperties); // copy any external properties

    Hadoop2MR1Planner.copyConfiguration(properties, configuration); // put all properties on the jobconf
}

From source file: cascading.platform.tez.Hadoop2TezPlatform.java

License: Open Source License

@Override
public synchronized void setUp() throws IOException {
    if (configuration != null)
        return;

    if (!isUseCluster()) {
        // Current usage requirements:
        // 1. Clients need to set "tez.local.mode" to true when creating a TezClient instance. (For the examples this can be done via -Dtez.local.mode=true)
        // 2. fs.defaultFS must be set to "file:///"
        // 2.1 If running examples - this must be set in tez-site.xml (so that it's picked up by the client, as well as the conf instances used to configure the Inputs / Outputs).
        // 2.2 If using it programmatically (without a tez-site.xml present), all configuration instances used (to create the client / configure Inputs / Outputs) must have this property set.
        // 3. tez.runtime.optimize.local.fetch needs to be set to true (either via tez-site.xml or in all configurations used to create the job (similar to fs.defaultFS in step 2))
        // 4. tez.staging-dir must be set (either programmatically or via tez-site.xml).
        // Until TEZ-1337 goes in - the staging-dir for the job is effectively the root of the filesystem (and where inputs are read from / written to if relative paths are used).

        LOG.info("not using cluster");
        configuration = new Configuration();

        configuration.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions());
        //      configuration.setInt( FlowRuntimeProps.GATHER_PARTITIONS, 1 ); // deadlocks if larger than 1

        configuration.set(TezConfiguration.TEZ_LOCAL_MODE, "true");
        configuration.set("fs.defaultFS", "file:///");
        configuration.set("tez.runtime.optimize.local.fetch", "true");

        // hack to prevent deadlocks where downstream processors are scheduled before upstream
        configuration.setInt("tez.am.inline.task.execution.max-tasks", 3); // testHashJoinMergeIntoHashJoinAccumulatedAccumulatedMerge fails if set to 2

        configuration.set(TezConfiguration.TEZ_IGNORE_LIB_URIS, "true"); // in local mode, use local classpath
        configuration.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1);
        configuration.set(TezConfiguration.TEZ_GENERATE_DEBUG_ARTIFACTS, "true");

        configuration.set("tez.am.mode.session", "true"); // allows multiple TezClient instances to be used in a single jvm

        if (!Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            configuration.set("hadoop.tmp.dir", System.getProperty("hadoop.tmp.dir"));
        else
            configuration.set("hadoop.tmp.dir", "build/test/tmp");

        fileSys = FileSystem.get(configuration);
    } else {
        LOG.info("using cluster");

        if (Util.isEmpty(System.getProperty("hadoop.log.dir")))
            System.setProperty("hadoop.log.dir", "build/test/log");

        if (Util.isEmpty(System.getProperty("hadoop.tmp.dir")))
            System.setProperty("hadoop.tmp.dir", "build/test/tmp");

        new File(System.getProperty("hadoop.log.dir")).mkdirs(); // ignored
        new File(System.getProperty("hadoop.tmp.dir")).mkdirs(); // ignored

        Configuration defaultConf = new Configuration();

        defaultConf.setInt(FlowRuntimeProps.GATHER_PARTITIONS, getNumGatherPartitions());

        defaultConf.setInt(YarnConfiguration.DEBUG_NM_DELETE_DELAY_SEC, -1);

        //      defaultConf.set( TezConfiguration.TEZ_AM_LOG_LEVEL, "DEBUG" );
        //      defaultConf.set( TezConfiguration.TEZ_TASK_LOG_LEVEL, "DEBUG" );

        defaultConf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
        defaultConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
        defaultConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, System.getProperty("hadoop.tmp.dir"));

        miniDFSCluster = new MiniDFSCluster.Builder(defaultConf).numDataNodes(4).format(true).racks(null)
                .build();

        fileSys = miniDFSCluster.getFileSystem();

        Configuration tezConf = new Configuration(defaultConf);
        tezConf.set("fs.defaultFS", fileSys.getUri().toString()); // use HDFS
        tezConf.set(MRJobConfig.MR_AM_STAGING_DIR, "/apps_staging_dir");

        // see MiniTezClusterWithTimeline as alternate
        miniTezCluster = new MiniTezCluster(getClass().getName(), 4, 1, 1); // todo: set to 4
        miniTezCluster.init(tezConf);
        miniTezCluster.start();

        configuration = miniTezCluster.getConfig();

        // stats won't work after completion unless ATS is used
        if (setTimelineStore(configuration)) // true if ats can be loaded and configured for this hadoop version
        {
            configuration.set(TezConfiguration.TEZ_HISTORY_LOGGING_SERVICE_CLASS,
                    ATSHistoryLoggingService.class.getName());
            configuration.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_ADDRESS, "localhost:10200");
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_ADDRESS, "localhost:8188");
            configuration.set(YarnConfiguration.TIMELINE_SERVICE_WEBAPP_HTTPS_ADDRESS, "localhost:8190");

            yarnHistoryServer = new ApplicationHistoryServer();
            yarnHistoryServer.init(configuration);
            yarnHistoryServer.start();
        }
    }

    configuration.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS, 1);
    configuration.setInt(TezConfiguration.TEZ_AM_TASK_MAX_FAILED_ATTEMPTS, 1);
    configuration.setInt(TezConfiguration.TEZ_AM_MAX_TASK_FAILURES_PER_NODE, 1);

    Map<Object, Object> globalProperties = getGlobalProperties();

    if (logger != null)
        globalProperties.put("log4j.logger", logger);

    FlowProps.setJobPollingInterval(globalProperties, 10); // should speed up tests

    Hadoop2TezPlanner.copyProperties(configuration, globalProperties); // copy any external properties

    Hadoop2TezPlanner.copyConfiguration(properties, configuration); // put all properties on the jobconf

    ExitUtil.disableSystemExit();

    //    forbidSystemExitCall();
}

From source file: cascading.tap.hadoop.BaseDistCacheTap.java

License: Open Source License

@Override
public TupleEntryIterator openForRead(FlowProcess<? extends Configuration> flowProcess, RecordReader input)
        throws IOException {
    // always read via the Hadoop FileSystem if in standalone/local mode, or if a RecordReader is provided
    if (HadoopUtil.isLocal(flowProcess.getConfig()) || input != null) {
        LOG.info("delegating to parent");
        return super.openForRead(flowProcess, input);
    }

    Path[] cachedFiles = getLocalCacheFiles(flowProcess);

    if (cachedFiles == null || cachedFiles.length == 0)
        return super.openForRead(flowProcess, null);

    List<Path> paths = new ArrayList<>();
    List<Tap> taps = new ArrayList<>();

    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(flowProcess.getConfig());
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        for (FileStatus status : statuses)
            paths.add(status.getPath());
    } else {
        paths.add(getHfs().getPath());
    }

    for (Path pathToFind : paths) {
        for (Path path : cachedFiles) {
            if (path.toString().endsWith(pathToFind.getName())) {
                LOG.info("found {} in distributed cache", path);
                taps.add(new Lfs(getScheme(), path.toString()));
            }
        }
    }

    if (paths.isEmpty()) // not in cache, read from HDFS
    {
        LOG.info("could not find files in local resource path. delegating to parent: {}",
                super.getIdentifier());
        return super.openForRead(flowProcess, input);
    }

    return new MultiSourceTap(taps.toArray(new Tap[taps.size()])).openForRead(flowProcess, input);
}

From source file: cascading.tap.hadoop.BaseDistCacheTap.java

License: Open Source License

private void registerHfs(FlowProcess<? extends Configuration> process, Configuration conf, Hfs hfs)
        throws IOException {
    if (isSimpleGlob()) {
        FileSystem fs = FileSystem.get(conf);
        FileStatus[] statuses = fs.globStatus(getHfs().getPath());

        if (statuses == null || statuses.length == 0)
            throw new TapException(String.format(
                    "glob expression %s does not match any files on the filesystem", getHfs().getPath()));

        for (FileStatus fileStatus : statuses)
            registerURI(conf, fileStatus.getPath());
    } else {
        registerURI(conf, hfs.getPath());
    }

    hfs.sourceConfInitComplete(process, conf);
}

From source file: cascading.tap.hadoop.Hfs.java

License: Open Source License

protected FileSystem getDefaultFileSystem(Configuration configuration) {
    try {
        return FileSystem.get(configuration);
    } catch (IOException exception) {
        throw new TapException("unable to get handle to underlying filesystem", exception);
    }
}

From source file: cascading.tap.hadoop.ZipInputFormatTest.java

License: Open Source License

public void testSplits() throws Exception {
    JobConf job = new JobConf();
    FileSystem currentFs = FileSystem.get(job);

    Path file = new Path(workDir, "test.zip");

    Reporter reporter = Reporter.NULL;

    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    FileInputFormat.setInputPaths(job, file);

    for (int entries = 1; entries < MAX_ENTRIES; entries += random.nextInt(MAX_ENTRIES / 10) + 1) {
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        ZipOutputStream zos = new ZipOutputStream(byteArrayOutputStream);
        long length = 0;

        LOG.debug("creating; zip file with entries = " + entries);

        // for each entry in the zip file
        for (int entryCounter = 0; entryCounter < entries; entryCounter++) {
            // construct zip entries splitting MAX_LENGTH between entries
            long entryLength = MAX_LENGTH / entries;
            ZipEntry zipEntry = new ZipEntry("/entry" + entryCounter + ".txt");
            zipEntry.setMethod(ZipEntry.DEFLATED);
            zos.putNextEntry(zipEntry);

            for (length = entryCounter * entryLength; length < (entryCounter + 1) * entryLength; length++) {
                zos.write(Long.toString(length).getBytes());
                zos.write("\n".getBytes());
            }

            zos.flush();
            zos.closeEntry();
        }

        zos.flush();
        zos.close();

        currentFs.delete(file, true);

        OutputStream outputStream = currentFs.create(file);

        byteArrayOutputStream.writeTo(outputStream);
        outputStream.close();

        ZipInputFormat format = new ZipInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();
        InputSplit[] splits = format.getSplits(job, 100);

        BitSet bits = new BitSet((int) length);
        for (int j = 0; j < splits.length; j++) {
            LOG.debug("split[" + j + "]= " + splits[j]);
            RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[j], job, reporter);

            try {
                int count = 0;

                while (reader.next(key, value)) {
                    int v = Integer.parseInt(value.toString());
                    LOG.debug("read " + v);

                    if (bits.get(v))
                        LOG.warn("conflict with " + v + " in split " + j + " at position " + reader.getPos());

                    assertFalse("key in multiple partitions.", bits.get(v));
                    bits.set(v);
                    count++;
                }

                LOG.debug("splits[" + j + "]=" + splits[j] + " count=" + count);
            } finally {
                reader.close();
            }
        }

        assertEquals("some keys in no partition.", length, bits.cardinality());
    }
}

From source file: cascading.tap.Hfs.java

License: Open Source License

protected FileSystem getDefaultFileSystem(JobConf jobConf) throws IOException {
    return FileSystem.get(jobConf);
}

From source file: cascading.tap.hive.HiveTap.java

License: Open Source License

@Override
public boolean resourceExists(JobConf conf) throws IOException {
    IMetaStoreClient metaStoreClient = null;
    try {
        metaStoreClient = createMetaStoreClient();
        Table table = metaStoreClient.getTable(tableDescriptor.getDatabaseName(),
                tableDescriptor.getTableName());
        modifiedTime = table.getLastAccessTime();
        // check if the schema matches the table descriptor. If not, throw an exception.
        if (strict) {
            LOG.info("strict mode: comparing existing hive table with table descriptor");
            if (!table.getTableType().equals(tableDescriptor.toHiveTable().getTableType()))
                throw new HiveTableValidationException(
                        String.format("expected a table of type '%s' but found '%s'",
                                tableDescriptor.toHiveTable().getTableType(), table.getTableType()));

            // Check that the paths are the same
            FileSystem fs = FileSystem.get(conf);
            StorageDescriptor sd = table.getSd();
            Path expectedPath = fs.makeQualified(
                    new Path(tableDescriptor.getLocation(hiveConf.getVar(ConfVars.METASTOREWAREHOUSE))));
            Path actualPath = fs.makeQualified(new Path(sd.getLocation()));

            if (!expectedPath.equals(actualPath))
                throw new HiveTableValidationException(
                        String.format("table in MetaStore does not have the sampe path. Expected %s got %s",
                                expectedPath, actualPath));

            List<FieldSchema> schemaList = sd.getCols();
            if (schemaList.size() != tableDescriptor.getColumnNames().length
                    - tableDescriptor.getPartitionKeys().length)
                throw new HiveTableValidationException(String.format(
                        "table in MetaStore does not have same number of columns. expected %d got %d",
                        tableDescriptor.getColumnNames().length - tableDescriptor.getPartitionKeys().length,
                        schemaList.size()));
            for (int index = 0; index < schemaList.size(); index++) {
                FieldSchema schema = schemaList.get(index);
                String expectedColumnName = tableDescriptor.getColumnNames()[index];
                String expectedColumnType = tableDescriptor.getColumnTypes()[index];
                // this could be extended to the StorageDescriptor if necessary.
                if (!schema.getName().equalsIgnoreCase(expectedColumnName))
                    throw new HiveTableValidationException(
                            String.format("hive schema mismatch: expected column name '%s', but found '%s'",
                                    expectedColumnName, schema.getName()));
                if (!schema.getType().equalsIgnoreCase(expectedColumnType))
                    throw new HiveTableValidationException(
                            String.format("hive schema mismatch: expected column type '%s', but found '%s'",
                                    expectedColumnType, schema.getType()));
            }
            List<FieldSchema> schemaPartitions = table.getPartitionKeys();
            if (schemaPartitions.size() != tableDescriptor.getPartitionKeys().length)
                throw new HiveTableValidationException(String.format(
                        "table in MetaStore does not have same number of partition columns. expected %d got %d",
                        tableDescriptor.getPartitionKeys().length, schemaPartitions.size()));
            int offset = tableDescriptor.getColumnNames().length - tableDescriptor.getPartitionKeys().length;
            for (int index = 0; index < schemaPartitions.size(); index++) {
                FieldSchema schema = schemaPartitions.get(index);
                String expectedColumnName = tableDescriptor.getColumnNames()[index + offset];
                String expectedColumnType = tableDescriptor.getColumnTypes()[index + offset];
                // this could be extended to the StorageDescriptor if necessary.
                if (!schema.getName().equalsIgnoreCase(expectedColumnName))
                    throw new HiveTableValidationException(String.format(
                            "hive partition schema mismatch: expected column name '%s', but found '%s'",
                            expectedColumnName, schema.getName()));
                if (!schema.getType().equalsIgnoreCase(expectedColumnType))
                    throw new HiveTableValidationException(String.format(
                            "hive partition schema mismatch: expected column type '%s', but found '%s'",
                            expectedColumnType, schema.getType()));
            }
        }
        return true;
    } catch (MetaException exception) {
        throw new IOException(exception);
    } catch (NoSuchObjectException exception) {
        return false;
    } catch (TException exception) {
        throw new IOException(exception);
    } finally {
        if (metaStoreClient != null)
            metaStoreClient.close();
    }
}

From source file: cc.slda.AnnotateDocuments.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));
    options.addOption(OptionBuilder.withArgName(PCUTOFF).hasArg()
            .withDescription("probability of topic assignment").create(PCUTOFF));
    options.addOption(OptionBuilder.withArgName(INDEX).hasArg()
            .withDescription("path to data directory containing term and title indices").create(INDEX));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT) || !cmdline.hasOption(INDEX)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String indexPath = cmdline.getOptionValue(INDEX);
    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    float cutoff = 0.9f;
    if (cmdline.hasOption(PCUTOFF)) {
        cutoff = Float.parseFloat(cmdline.getOptionValue(PCUTOFF));
    }
    LOG.info("Tool: " + AnnotateDocuments.class.getSimpleName());
    LOG.info(" - indices path: " + indexPath);
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);
    LOG.info(" - log(probCutoff): " + Math.log(cutoff));

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    Job job = Job.getInstance(conf);
    job.setJobName(AnnotateDocuments.class.getSimpleName());
    job.setJarByClass(AnnotateDocuments.class);

    String termIndex = indexPath + Path.SEPARATOR + TERM;
    String titleIndex = indexPath + Path.SEPARATOR + TITLE;

    Path termIndexPath = new Path(termIndex);
    Path titleIndexPath = new Path(titleIndex);

    Preconditions.checkArgument(fs.exists(termIndexPath), "Missing term index files... " + termIndexPath);
    DistributedCache.addCacheFile(termIndexPath.toUri(), job.getConfiguration());
    Preconditions.checkArgument(fs.exists(titleIndexPath), "Missing title index files... " + titleIndexPath);
    DistributedCache.addCacheFile(titleIndexPath.toUri(), job.getConfiguration());

    job.setNumReduceTasks(reduceTasks);
    conf.setFloat(PCUTOFF, cutoff);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(HMapSIW.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(HMapSIW.class);

    job.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    Path outputDir = new Path(outputPath);
    FileSystem.get(conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file: cc.slda.DisplayTopic.java

License: Apache License

@SuppressWarnings("unchecked")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(Settings.HELP_OPTION, false, "print the help message");
    options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
            .withDescription("input beta file").create(Settings.INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName(Settings.PATH_INDICATOR).hasArg()
            .withDescription("term index file").create(ParseCorpus.INDEX));
    options.addOption(OptionBuilder.withArgName(Settings.INTEGER_INDICATOR).hasArg()
            .withDescription("display top terms only (default - 10)").create(TOP_DISPLAY_OPTION));

    String betaString = null;
    String indexString = null;
    int topDisplay = TOP_DISPLAY;

    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    try {
        CommandLine line = parser.parse(options, args);

        if (line.hasOption(Settings.HELP_OPTION)) {
            formatter.printHelp(ParseCorpus.class.getName(), options);
            System.exit(0);
        }

        if (line.hasOption(Settings.INPUT_OPTION)) {
            betaString = line.getOptionValue(Settings.INPUT_OPTION);
        } else {
            throw new ParseException("Parsing failed due to " + Settings.INPUT_OPTION + " not initialized...");
        }

        if (line.hasOption(ParseCorpus.INDEX)) {
            indexString = line.getOptionValue(ParseCorpus.INDEX);
        } else {
            throw new ParseException("Parsing failed due to " + ParseCorpus.INDEX + " not initialized...");
        }

        if (line.hasOption(TOP_DISPLAY_OPTION)) {
            topDisplay = Integer.parseInt(line.getOptionValue(TOP_DISPLAY_OPTION));
        }
    } catch (ParseException pe) {
        System.err.println(pe.getMessage());
        formatter.printHelp(ParseCorpus.class.getName(), options);
        System.exit(0);
    } catch (NumberFormatException nfe) {
        System.err.println(nfe.getMessage());
        System.exit(0);
    }

    JobConf conf = new JobConf(DisplayTopic.class);
    FileSystem fs = FileSystem.get(conf);

    Path indexPath = new Path(indexString);
    Preconditions.checkArgument(fs.exists(indexPath) && fs.isFile(indexPath), "Invalid index path...");

    Path betaPath = new Path(betaString);
    Preconditions.checkArgument(fs.exists(betaPath) && fs.isFile(betaPath), "Invalid beta path...");

    SequenceFile.Reader sequenceFileReader = null;
    try {
        IntWritable intWritable = new IntWritable();
        Text text = new Text();
        Map<Integer, String> termIndex = new HashMap<Integer, String>();
        sequenceFileReader = new SequenceFile.Reader(fs, indexPath, conf);
        while (sequenceFileReader.next(intWritable, text)) {
            termIndex.put(intWritable.get(), text.toString());
        }

        PairOfIntFloat pairOfIntFloat = new PairOfIntFloat();
        // HMapIFW hmap = new HMapIFW();
        HMapIDW hmap = new HMapIDW();
        TreeMap<Double, Integer> treeMap = new TreeMap<Double, Integer>();
        sequenceFileReader = new SequenceFile.Reader(fs, betaPath, conf);
        while (sequenceFileReader.next(pairOfIntFloat, hmap)) {
            treeMap.clear();

            System.out.println("==============================");
            System.out.println(
                    "Top ranked " + topDisplay + " terms for Topic " + pairOfIntFloat.getLeftElement());
            System.out.println("==============================");

            Iterator<Integer> itr1 = hmap.keySet().iterator();
            int temp1 = 0;
            while (itr1.hasNext()) {
                temp1 = itr1.next();
                treeMap.put(-hmap.get(temp1), temp1);
                if (treeMap.size() > topDisplay) {
                    treeMap.remove(treeMap.lastKey());
                }
            }

            Iterator<Double> itr2 = treeMap.keySet().iterator();
            double temp2 = 0;
            while (itr2.hasNext()) {
                temp2 = itr2.next();
                if (termIndex.containsKey(treeMap.get(temp2))) {
                    System.out.println(termIndex.get(treeMap.get(temp2)) + "\t\t" + -temp2);
                } else {
                    System.out.println("How embarrassing! Term index not found...");
                }
            }
        }
    } finally {
        IOUtils.closeStream(sequenceFileReader);
    }

    return 0;
}