List of usage examples for org.apache.hadoop.fs.Path#makeQualified
@Deprecated
public Path makeQualified(FileSystem fs)
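This overload is deprecated in recent Hadoop releases in favor of Path#makeQualified(URI, Path) or FileSystem#makeQualified(Path). For orientation before the examples below, a minimal sketch of the call and its non-deprecated equivalents (the path and namenode address are illustrative):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path p = new Path("data/input");      // relative and schemeless
        FileSystem fs = p.getFileSystem(conf);

        @SuppressWarnings("deprecation")
        Path q1 = p.makeQualified(fs);        // the deprecated form shown on this page

        // Non-deprecated equivalents:
        Path q2 = fs.makeQualified(p);
        Path q3 = p.makeQualified(fs.getUri(), fs.getWorkingDirectory());

        // All three resolve against the filesystem's URI and working directory,
        // e.g. hdfs://namenode:8020/user/alice/data/input
        System.out.println(q1 + " " + q2 + " " + q3);
    }
}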
From source file:com.splout.db.hadoop.StoreDeployerTool.java
License:Apache License
/** Deploys a collection of already-generated tablespaces in a single deploy request. */
@SuppressWarnings("unchecked")
public void deploy(Collection<TablespaceDepSpec> deployments) throws JSONSerDeException, IOException {
    // Query the alive DNodes and build deploy requests accordingly
    DeployRequest[] deployRequests = new DeployRequest[deployments.size()];
    log.info("Querying Splout QNode for list of DNodes...");
    SploutClient client = new SploutClient(qnode);
    List<String> dnodes = client.dNodeList();
    if (dnodes == null || dnodes.size() == 0) {
        throw new IOException("No available DNodes in Splout cluster.");
    }

    int tIndex = 0;
    for (TablespaceDepSpec tablespace : deployments) {
        Path tablespaceOut = new Path(tablespace.getSourcePath());
        // Define a DeployRequest for this tablespace
        deployRequests[tIndex] = new DeployRequest();

        // Splout only accepts absolute URIs
        FileSystem sourceFs = tablespaceOut.getFileSystem(conf);
        if (!sourceFs.exists(tablespaceOut)) {
            throw new IllegalArgumentException("Folder doesn't exist: " + tablespaceOut);
        }
        @SuppressWarnings("deprecation")
        Path absoluteOutPath = tablespaceOut.makeQualified(sourceFs);

        Path partitionMapPath = new Path(tablespaceOut, TablespaceGenerator.OUT_PARTITION_MAP);
        if (!sourceFs.exists(partitionMapPath)) {
            throw new IllegalArgumentException(
                "Invalid tablespace folder: " + tablespaceOut + " doesn't contain a partition-map file.");
        }

        // Load the partition map
        PartitionMap partitionMap = JSONSerDe.deSer(HadoopUtils.fileToString(sourceFs, partitionMapPath),
            PartitionMap.class);

        // Load the init statements, if they exist
        ArrayList<String> initStatements = new ArrayList<String>();
        Path initStatementsPath = new Path(tablespaceOut, TablespaceGenerator.OUT_INIT_STATEMENTS);
        if (sourceFs.exists(initStatementsPath)) {
            initStatements.addAll(
                JSONSerDe.deSer(HadoopUtils.fileToString(sourceFs, initStatementsPath), ArrayList.class));
        }
        // Add the other init statements coming in the deploy request
        if (tablespace.getInitStatements() != null) {
            initStatements.addAll(tablespace.getInitStatements());
        }

        String engine = DefaultEngine.class.getName();
        // Load the engine id used by the generation tool, if it exists (to maintain backwards compatibility)
        Path engineId = new Path(tablespaceOut, TablespaceGenerator.OUT_ENGINE);
        if (sourceFs.exists(engineId)) {
            engine = HadoopUtils.fileToString(sourceFs, engineId);
            log.info("Using generated engine id: " + engine);
        }

        // Finally, fill in the request
        deployRequests[tIndex].setInitStatements(initStatements);
        deployRequests[tIndex].setEngine(engine);
        deployRequests[tIndex].setTablespace(tablespace.getTablespace());
        deployRequests[tIndex].setData_uri(new Path(absoluteOutPath, "store").toUri().toString());
        deployRequests[tIndex].setPartitionMap(partitionMap.getPartitionEntries());

        // If replication > number of DNodes, that level of replication is impossible to reach
        int repFactor = tablespace.getReplication();
        if (dnodes.size() < repFactor) {
            log.warn("WARNING: Replication factor " + repFactor + " for tablespace "
                + tablespace.getTablespace()
                + " is bigger than the number of serving DNodes. Adjusting replication factor to "
                + dnodes.size());
            repFactor = dnodes.size();
        }
        deployRequests[tIndex].setReplicationMap(ReplicationMap.roundRobinMap(
            partitionMap.getPartitionEntries().size(), repFactor,
            dnodes.toArray(new String[0])).getReplicationEntries());

        tIndex++;
    }

    // Finally, send the deploy request
    DeployInfo dInfo = client.deploy(deployRequests);
    log.info("Deploy request of [" + deployments.size() + "] tablespaces performed. Deploy on [" + qnode
        + "] with version [" + dInfo.getVersion() + "] in progress.");
}
From source file:com.splout.db.integration.HadoopIntegrationTest.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // Validate params
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Splout Hadoop Compatibility Integration Test");
    try {
        jComm.parse(args);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    Path tmpHdfsPath = new Path(
        "tmp-" + HadoopIntegrationTest.class.getName() + "-" + System.currentTimeMillis());
    FileSystem fS = tmpHdfsPath.getFileSystem(getConf());
    fS.mkdirs(tmpHdfsPath);
    fS.mkdirs(new Path(tmpHdfsPath, "input"));
    fS.mkdirs(new Path(tmpHdfsPath, "output"));

    boolean isLocal = FileSystem.get(conf).equals(FileSystem.getLocal(conf));
    if (!isLocal) {
        SploutHadoopConfiguration.addSQLite4JavaNativeLibsToDC(conf);
    }

    // Qualify the temporary dir so the string forms passed to the tools below are absolute URIs
    tmpHdfsPath = tmpHdfsPath.makeQualified(fS);

    Path pageCounts = new Path(input);
    FileUtil.copy(FileSystem.getLocal(getConf()), pageCounts, fS, new Path(tmpHdfsPath, "input"), false,
        getConf());

    SimpleGeneratorCMD generator = new SimpleGeneratorCMD();
    generator.setConf(getConf());
    if (generator.run(new String[] { "-tb", "pagecountsintegration", "-t", "pagecounts", "-i",
        tmpHdfsPath + "/input", "-o", tmpHdfsPath + "/output", "-s",
        "projectcode:string, pagename:string, visits:int, bytes:long", "-pby", "projectcode,pagename",
        "-sep", "\" \"", "-p", "2", "-e", engine }) < 0) {
        throw new RuntimeException("Generator failed!");
    }

    SploutClient client = new SploutClient(qnode);
    QNodeStatus status = client.overview();
    long previousVersion = -1;
    if (status.getTablespaceMap().get("pagecountsintegration") != null) {
        previousVersion = status.getTablespaceMap().get("pagecountsintegration").getVersion();
    }

    DeployerCMD deployer = new DeployerCMD();
    deployer.setConf(getConf());
    if (deployer.run(new String[] { "-r", "2", "-q", qnode, "-root", tmpHdfsPath + "/output", "-ts",
        "pagecountsintegration" }) < 0) {
        throw new RuntimeException("Deployer failed!");
    }

    // Wait until the new version is being served
    long waitedSoFar = 0;
    status = client.overview();
    while (status.getTablespaceMap().get("pagecountsintegration") == null
        || previousVersion == status.getTablespaceMap().get("pagecountsintegration").getVersion()) {
        Thread.sleep(2000);
        waitedSoFar += 2000;
        status = client.overview();
        if (waitedSoFar > 90000) {
            throw new RuntimeException(
                "Deploy must have failed in Splout's server. Waiting too much for it to complete.");
        }
    }
    previousVersion = status.getTablespaceMap().get("pagecountsintegration").getVersion();

    QueryStatus qStatus = client.query("pagecountsintegration", "*", "SELECT * FROM pagecounts;", null);
    System.out.println(qStatus.getResult());
    if (qStatus.getResult() == null) {
        throw new RuntimeException("Something failed as query() is returning null!");
    }
    System.out.println("Everything fine.");
    return 1; // note: the original source returns 1 rather than the conventional 0 on success
}
From source file:com.test.hadoop.JhhSort.java
License:Apache License
/**
 * The main driver for the sort program. Invoke this method to submit the map/reduce job.
 *
 * @throws IOException when there are communication problems with the job tracker.
 */
@SuppressWarnings({ "rawtypes" })
public int run(String[] args) throws Exception {
    JobConf jobConf = new JobConf(getConf(), JhhSort.class);
    jobConf.setJobName("sorter");
    jobConf.set("mapred.job.tracker", "192.168.12.200:9001");
    jobConf.set("fs.default.name", "hdfs://192.168.12.200:9000");
    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(IdentityReducer.class);

    JobClient client = new JobClient(jobConf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.5);
    String sort_reduces = jobConf.get("test.sort.reduces_per_host");
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }

    Class<? extends InputFormat> inputFormatClass = TextInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = TextOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = LongWritable.class;
    Class<? extends Writable> outputValueClass = LongWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                jobConf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    jobConf.setNumReduceTasks(num_reduces);
    jobConf.setInputFormat(inputFormatClass);
    jobConf.setOutputFormat(outputFormatClass);
    jobConf.setOutputKeyClass(outputKeyClass);
    jobConf.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(jobConf, otherArgs.get(0));
    FileOutputFormat.setOutputPath(jobConf, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        jobConf.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        // Qualify the input dir so the partition file below gets an absolute URI
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        InputSampler.<K, V>writePartitionFile(jobConf, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, jobConf);
        DistributedCache.createSymlink(jobConf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
        + FileInputFormat.getInputPaths(jobConf)[0] + " into " + FileOutputFormat.getOutputPath(jobConf)
        + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    jobResult = JobClient.runJob(jobConf);
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return 0;
}
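Distilled from the -totalOrder branch above: the partition file is placed under the qualified input directory so that the URI added to the DistributedCache is absolute and unambiguous. A minimal sketch of that pattern (the class name is hypothetical; jobConf is assumed to already carry its input paths):

import java.net.URI;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

// Sketch: make the partition file fully qualified before caching it.
public class PartitionFileSetup {
    static void registerPartitionFile(JobConf jobConf) throws Exception {
        Path inputDir = FileInputFormat.getInputPaths(jobConf)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(jobConf)); // absolute, with scheme
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(jobConf, partitionFile);
        // The "#_sortPartitioning" fragment gives each task a symlink of that name in its cwd
        DistributedCache.addCacheFile(new URI(partitionFile + "#_sortPartitioning"), jobConf);
        DistributedCache.createSymlink(jobConf);
    }
}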
From source file:com.yahoo.storm.yarn.StormOnYarn.java
License:Open Source License
private void launchApp(String appName, String queue, int amMB, String storm_zip_location) throws Exception {
    LOG.debug("StormOnYarn:launchApp() ...");
    YarnClientApplication client_app = _yarn.createApplication();
    GetNewApplicationResponse app = client_app.getNewApplicationResponse();
    _appId = app.getApplicationId();
    LOG.debug("_appId:" + _appId);

    if (amMB > app.getMaximumResourceCapability().getMemory()) {
        // TODO need some sanity checks
        amMB = app.getMaximumResourceCapability().getMemory();
    }
    ApplicationSubmissionContext appContext = Records.newRecord(ApplicationSubmissionContext.class);
    appContext.setApplicationId(app.getApplicationId());
    appContext.setApplicationName(appName);
    appContext.setQueue(queue);

    // Set up the container launch context for the application master
    ContainerLaunchContext amContainer = Records.newRecord(ContainerLaunchContext.class);
    Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();

    // Set local resources (files or archives) for the application master.
    // In this scenario, the jar file for the application master is part of the local resources.
    LOG.info("Copy App Master jar from local filesystem and add to local environment");
    // Copy the application master jar to the filesystem and create a local resource
    // pointing to the destination jar path
    String appMasterJar = findContainingJar(MasterServer.class);
    FileSystem fs = FileSystem.get(_hadoopConf);
    Path src = new Path(appMasterJar);
    String appHome = Util.getApplicationHomeForId(_appId.toString());
    Path dst = new Path(fs.getHomeDirectory(), appHome + Path.SEPARATOR + "AppMaster.jar");
    fs.copyFromLocalFile(false, true, src, dst);
    localResources.put("AppMaster.jar", Util.newYarnAppResource(fs, dst));

    String stormVersion = Util.getStormVersion();
    Path zip;
    if (storm_zip_location != null) {
        zip = new Path(storm_zip_location);
    } else {
        zip = new Path("/lib/storm/" + stormVersion + "/storm.zip");
    }
    // Qualify the zip against the filesystem, then store only the path component of the URI
    _stormConf.put("storm.zip.path", zip.makeQualified(fs).toUri().getPath());
    LocalResourceVisibility visibility = LocalResourceVisibility.PUBLIC;
    _stormConf.put("storm.zip.visibility", "PUBLIC");
    if (!Util.isPublic(fs, zip)) {
        visibility = LocalResourceVisibility.APPLICATION;
        _stormConf.put("storm.zip.visibility", "APPLICATION");
    }
    localResources.put("storm", Util.newYarnAppResource(fs, zip, LocalResourceType.ARCHIVE, visibility));

    Path confDst = Util.createConfigurationFileInFs(fs, appHome, _stormConf, _hadoopConf);
    // Establish a symbolic link to the conf directory
    localResources.put("conf", Util.newYarnAppResource(fs, confDst));

    // Set up security tokens
    Path[] paths = new Path[3];
    paths[0] = dst;
    paths[1] = zip;
    paths[2] = confDst;
    Credentials credentials = new Credentials();
    TokenCache.obtainTokensForNamenodes(credentials, paths, _hadoopConf);
    DataOutputBuffer dob = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(dob);
    ByteBuffer securityTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    // Security tokens for the HDFS distributed cache
    amContainer.setTokens(securityTokens);

    // Set local resource info into the app master container launch context
    amContainer.setLocalResources(localResources);

    // Set the env variables to be set up in the env where the application master will run
    LOG.info("Set the environment for the application master");
    Map<String, String> env = new HashMap<String, String>();
    // Add the runtime classpath needed for tests to work
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./conf");
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./AppMaster.jar");

    // Make sure that the AppMaster has access to all YARN JARs
    List<String> yarn_classpath_cmd = java.util.Arrays.asList("yarn", "classpath");
    ProcessBuilder pb = new ProcessBuilder(yarn_classpath_cmd).redirectError(Redirect.INHERIT);
    LOG.info("YARN CLASSPATH COMMAND = [" + yarn_classpath_cmd + "]");
    pb.environment().putAll(System.getenv());
    Process proc = pb.start();
    BufferedReader reader = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
    String line = "";
    String yarn_class_path = (String) _stormConf.get("storm.yarn.yarn_classpath");
    if (yarn_class_path == null) {
        StringBuilder yarn_class_path_builder = new StringBuilder();
        while ((line = reader.readLine()) != null) {
            yarn_class_path_builder.append(line);
        }
        yarn_class_path = yarn_class_path_builder.toString();
    }
    LOG.info("YARN CLASSPATH = [" + yarn_class_path + "]");
    proc.waitFor();
    reader.close();
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), yarn_class_path);

    String stormHomeInZip = Util.getStormHomeInZip(fs, zip, stormVersion);
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./storm/" + stormHomeInZip + "/*");
    Apps.addToEnvironment(env, Environment.CLASSPATH.name(), "./storm/" + stormHomeInZip + "/lib/*");

    String java_home = (String) _stormConf.get("storm.yarn.java_home");
    if (java_home == null)
        java_home = System.getenv("JAVA_HOME");
    if (java_home != null && !java_home.isEmpty())
        env.put("JAVA_HOME", java_home);
    LOG.info("Using JAVA_HOME = [" + env.get("JAVA_HOME") + "]");

    env.put("appJar", appMasterJar);
    env.put("appName", appName);
    env.put("appId", new Integer(_appId.getId()).toString());
    env.put("STORM_LOG_DIR", ApplicationConstants.LOG_DIR_EXPANSION_VAR);
    amContainer.setEnvironment(env);

    // Set the necessary command to execute the application master
    Vector<String> vargs = new Vector<String>();
    if (java_home != null && !java_home.isEmpty())
        vargs.add(env.get("JAVA_HOME") + "/bin/java");
    else
        vargs.add("java");
    vargs.add("-Dstorm.home=./storm/" + stormHomeInZip + "/");
    vargs.add("-Dlogfile.name=" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/master.log");
    // vargs.add("-verbose:class");
    vargs.add("com.yahoo.storm.yarn.MasterServer");
    // Note: the original source redirects stdout (1>) to "stderr" and stderr (2>) to "stdout"
    vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");
    vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");

    // Set the java executable command
    LOG.info("Setting up app master command:" + vargs);
    amContainer.setCommands(vargs);

    // Set up resource type requirements.
    // For now, only memory is supported, so we set memory requirements.
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(amMB);
    appContext.setResource(capability);
    appContext.setAMContainerSpec(amContainer);

    _yarn.submitApplication(appContext);
}
From source file:crunch.MaxTemperature.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    JobConf conf = JobBuilder.parseInputAndOutput(this, getConf(), args);
    if (conf == null) {
        return -1;
    }

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressorClass(conf, GzipCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);

    conf.setPartitionerClass(TotalOrderPartitioner.class);

    InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>(0.1,
        10000, 10);

    Path input = FileInputFormat.getInputPaths(conf)[0];
    input = input.makeQualified(input.getFileSystem(conf));

    Path partitionFile = new Path(input, "_partitions");
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    InputSampler.writePartitionFile(conf, sampler);

    // Add to DistributedCache
    URI partitionUri = new URI(partitionFile.toString() + "#_partitions");
    DistributedCache.addCacheFile(partitionUri, conf);
    DistributedCache.createSymlink(conf);

    JobClient.runJob(conf);
    return 0;
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.io.hadoop.HdfsResource.java
License:Apache License
@SuppressWarnings("deprecation") HdfsResource(Path path, FileSystem fs) { Assert.notNull(path, "a valid path is required"); Assert.notNull(fs, "non null file system required"); this.location = path.toString(); this.fs = fs; this.path = path.makeQualified(fs); boolean exists = false; try {//from w ww .ja va 2 s . c o m exists = fs.exists(path); } catch (final Exception ex) { } this.exists = exists; FileStatus status = null; try { status = fs.getFileStatus(path); } catch (final Exception ex) { } this.status = status; }
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Ensures that the given class is in the class path of running jobs.
 * If the jar is not already in the class path, it is added to the
 * DistributedCache of the given job to ensure the associated job works fine.
 * @param conf
 * @param klass
 */
public static void addClassToPath(Configuration conf, Class<?> klass) {
    // Check if we need to add the containing jar to the class path
    String klassJar = findContainingJar(klass);
    String shadoopJar = findContainingJar(SpatialSite.class);
    if (klassJar == null || (shadoopJar != null && klassJar.equals(shadoopJar)))
        return;
    Path containingJar = new Path(findContainingJar(klass));
    Path[] existingClassPaths = DistributedCache.getArchiveClassPaths(conf);
    if (existingClassPaths != null) {
        for (Path existingClassPath : existingClassPaths) {
            if (containingJar.getName().equals(existingClassPath.getName()))
                return;
        }
    }

    // The containing jar is a new one and needs to be copied to the class path
    try {
        LOG.info("Adding JAR '" + containingJar.getName() + "' to job class path");
        FileSystem defaultFS = FileSystem.get(conf);
        Path libFolder;
        if (existingClassPaths != null && existingClassPaths.length > 0) {
            libFolder = existingClassPaths[0].getParent();
        } else {
            // First jar to be added like this; create a new lib folder
            do {
                libFolder = new Path("lib_" + (int) (Math.random() * 100000));
            } while (defaultFS.exists(libFolder));
            defaultFS.mkdirs(libFolder);
            defaultFS.deleteOnExit(libFolder);
        }
        defaultFS.copyFromLocalFile(containingJar, libFolder);
        Path jarFullPath = new Path(libFolder, containingJar.getName()).makeQualified(defaultFS);
        // makeQualified is idempotent, so this second qualification is a no-op
        jarFullPath = jarFullPath.makeQualified(defaultFS);
        DistributedCache.addArchiveToClassPath(jarFullPath, conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/**
 * Creates a full spatio-temporal hierarchy for a source folder.
 * @throws ParseException
 * @throws InterruptedException
 */
public static void directoryIndexer(final OperationsParams params)
    throws IOException, ParseException, InterruptedException {
    Path inputDir = params.getInputPath();
    FileSystem sourceFs = inputDir.getFileSystem(params);
    final Path sourceDir = inputDir.makeQualified(sourceFs);
    Path destDir = params.getOutputPath();
    final FileSystem destFs = destDir.getFileSystem(params);
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;

    // Create daily indexes that do not exist
    final Path dailyIndexDir = new Path(destDir, "daily");
    FileStatus[] matchingDays = timeRange == null ? sourceFs.listStatus(inputDir)
        : sourceFs.listStatus(inputDir, timeRange);
    final Vector<Path> sourceFiles = new Vector<Path>();
    for (FileStatus matchingDay : matchingDays) {
        for (FileStatus matchingTile : sourceFs.listStatus(matchingDay.getPath())) {
            sourceFiles.add(matchingTile.getPath());
        }
    }
    // Shuffle the array for better load balancing across threads
    Collections.shuffle(sourceFiles);
    final String datasetName = params.get("dataset");

    Parallel.forEach(sourceFiles.size(), new RunnableRange<Object>() {
        @Override
        public Object run(int i1, int i2) {
            LOG.info("Worker [" + i1 + "," + i2 + ") started");
            for (int i = i1; i < i2; i++) {
                Path sourceFile = sourceFiles.get(i);
                try {
                    Path relativeSourceFile = makeRelative(sourceDir, sourceFile);
                    Path destFilePath = new Path(dailyIndexDir, relativeSourceFile);
                    if (!destFs.exists(destFilePath)) {
                        LOG.info("Worker [" + i1 + "," + i2 + ") indexing: " + sourceFile.getName());
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (destFs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(destFs);
                        if (datasetName == null)
                            throw new RuntimeException(
                                "Please provide the name of the dataset you would like to index");
                        AggregateQuadTree.build(params, sourceFile, datasetName, tmpFile);
                        synchronized (destFs) {
                            Path destDir = destFilePath.getParent();
                            if (!destFs.exists(destDir))
                                destFs.mkdirs(destDir);
                        }
                        destFs.rename(tmpFile, destFilePath);
                    }
                } catch (IOException e) {
                    throw new RuntimeException("Error building an index for " + sourceFile, e);
                }
            }
            LOG.info("Worker [" + i1 + "," + i2 + ") finished");
            return null;
        }
    });
    LOG.info("Done generating daily indexes");

    // Merge daily indexes into monthly indexes
    Path monthlyIndexDir = new Path(destDir, "monthly");
    final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy.MM.dd");
    final SimpleDateFormat monthFormat = new SimpleDateFormat("yyyy.MM");
    mergeIndexes(destFs, dailyIndexDir, monthlyIndexDir, dayFormat, monthFormat, params);
    LOG.info("Done generating monthly indexes");

    // Merge monthly indexes into yearly indexes
    Path yearlyIndexDir = new Path(destDir, "yearly");
    final SimpleDateFormat yearFormat = new SimpleDateFormat("yyyy");
    mergeIndexes(destFs, monthlyIndexDir, yearlyIndexDir, monthFormat, yearFormat, params);
    LOG.info("Done generating yearly indexes");
}
From source file:edu.umn.cs.spatialHadoop.nasa.StockQuadTree.java
License:Open Source License
/**
 * Merges a set of indexes into larger indexes.
 * @param fs
 * @param srcIndexDir
 * @param dstIndexDir
 * @param srcFormat
 * @param dstFormat
 * @param params
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 */
private static void mergeIndexes(final FileSystem fs, Path srcIndexDir, Path dstIndexDir,
    SimpleDateFormat srcFormat, SimpleDateFormat dstFormat, final OperationsParams params)
    throws IOException, ParseException, InterruptedException {
    TimeRange timeRange = params.get("time") != null ? new TimeRange(params.get("time")) : null;
    final FileStatus[] sourceIndexes = timeRange == null ? fs.listStatus(srcIndexDir)
        : fs.listStatus(srcIndexDir, timeRange);
    Arrays.sort(sourceIndexes); // Alphabetical sort acts as sort-by-date here

    // Scan the source indexes and merge each consecutive run belonging to the same unit
    int i1 = 0;
    while (i1 < sourceIndexes.length) {
        final String indexToCreate = dstFormat.format(srcFormat.parse(sourceIndexes[i1].getPath().getName()));
        int i2 = i1 + 1;
        // Keep scanning as long as the source index belongs to the same dest index
        while (i2 < sourceIndexes.length && dstFormat
            .format(srcFormat.parse(sourceIndexes[i2].getPath().getName())).equals(indexToCreate))
            i2++;

        // Merge all source indexes in the range [i1, i2) into one dest index.
        // Copy i1, i2 to final variables so they are accessible from threads.
        final int firstIndex = i1;
        final int lastIndex = i2;
        final Path destIndex = new Path(dstIndexDir, indexToCreate);

        // For each tile, merge all values in all source indexes.
        // A regular expression to catch the tile identifier of a MODIS grid cell.
        final Pattern MODISTileID = Pattern.compile("^.*(h\\d\\dv\\d\\d).*$");
        final FileStatus[] tilesInFirstDay = fs.listStatus(sourceIndexes[i1].getPath());
        // Shuffle the array for better load balancing across threads
        Random rand = new Random();
        for (int i = 0; i < tilesInFirstDay.length - 1; i++) {
            // Swap the entry at i with any following entry
            int j = i + rand.nextInt(tilesInFirstDay.length - i - 1);
            FileStatus temp = tilesInFirstDay[i];
            tilesInFirstDay[i] = tilesInFirstDay[j];
            tilesInFirstDay[j] = temp;
        }

        Parallel.forEach(tilesInFirstDay.length, new RunnableRange<Object>() {
            @Override
            public Object run(int i_file1, int i_file2) {
                for (int i_file = i_file1; i_file < i_file2; i_file++) {
                    try {
                        FileStatus tileInFirstDay = tilesInFirstDay[i_file];

                        // Extract the tile ID
                        Matcher matcher = MODISTileID.matcher(tileInFirstDay.getPath().getName());
                        if (!matcher.matches()) {
                            LOG.warn("Cannot extract tile id from file " + tileInFirstDay.getPath());
                            continue;
                        }
                        final String tileID = matcher.group(1);
                        Path destIndexFile = new Path(destIndex, tileID);

                        PathFilter tileFilter = new PathFilter() {
                            @Override
                            public boolean accept(Path path) {
                                return path.getName().contains(tileID);
                            }
                        };

                        // Find matching tiles in all source indexes to merge
                        Vector<Path> filesToMerge = new Vector<Path>(lastIndex - firstIndex);
                        filesToMerge.add(tileInFirstDay.getPath());
                        for (int iDailyIndex = firstIndex + 1; iDailyIndex < lastIndex; iDailyIndex++) {
                            FileStatus[] matchedTileFile = fs.listStatus(sourceIndexes[iDailyIndex].getPath(),
                                tileFilter);
                            if (matchedTileFile.length == 0)
                                LOG.warn("Could not find tile " + tileID + " in dir "
                                    + sourceIndexes[iDailyIndex].getPath());
                            else if (matchedTileFile.length == 1)
                                filesToMerge.add(matchedTileFile[0].getPath());
                        }

                        if (fs.exists(destIndexFile)) {
                            // The destination file already exists. Compare the modification
                            // times of the destination and source files to see whether it
                            // needs to be updated.
                            long destTimestamp = fs.getFileStatus(destIndexFile).getModificationTime();
                            boolean needsUpdate = false;
                            for (Path fileToMerge : filesToMerge) {
                                long sourceTimestamp = fs.getFileStatus(fileToMerge).getModificationTime();
                                if (sourceTimestamp > destTimestamp) {
                                    needsUpdate = true;
                                    break;
                                }
                            }
                            if (!needsUpdate)
                                continue;
                            else
                                LOG.info("Updating file " + destIndexFile.getName());
                        }

                        // Do the merge
                        Path tmpFile;
                        do {
                            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
                        } while (fs.exists(tmpFile));
                        tmpFile = tmpFile.makeQualified(fs);
                        LOG.info("Merging tile " + tileID + " into file " + destIndexFile);
                        AggregateQuadTree.merge(params, filesToMerge.toArray(new Path[filesToMerge.size()]),
                            tmpFile);
                        synchronized (fs) {
                            Path destDir = destIndexFile.getParent();
                            if (!fs.exists(destDir))
                                fs.mkdirs(destDir);
                        }
                        fs.rename(tmpFile, destIndexFile);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                return null;
            }
        });
        i1 = i2;
    }
}
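Both StockQuadTree methods use the same scratch-file idiom: pick an unused relative name, qualify it against the destination filesystem, build into it, then rename into place. A minimal sketch of that idiom (the class name and writeIndex() stand-in are hypothetical):

import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Sketch of the tmp-file-then-rename idiom.
public class TmpFileIdiom {
    @SuppressWarnings("deprecation")
    static void buildAtomically(FileSystem destFs, Path destFilePath) throws IOException {
        Path tmpFile;
        do {
            tmpFile = new Path((int) (Math.random() * 1000000) + ".tmp");
        } while (destFs.exists(tmpFile));
        // Qualify so every later exists()/rename() call unambiguously targets destFs
        tmpFile = tmpFile.makeQualified(destFs);

        writeIndex(tmpFile); // hypothetical stand-in for the real build/merge step

        Path parent = destFilePath.getParent();
        if (!destFs.exists(parent))
            destFs.mkdirs(parent);
        destFs.rename(tmpFile, destFilePath); // publish the finished file
    }

    private static void writeIndex(Path p) { /* stand-in for the real build/merge step */ }
}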
From source file:edu.umn.cs.spatialHadoop.nasa.HTTPFileSystem.java
License:Open Source License
/**
 * Lists all files and directories in a given Path that points to a directory.
 * While this function is written in a generic way, it was designed and tested
 * only with LP DAAC archives.
 */
@Override
public FileStatus[] listStatus(Path f) throws IOException {
    Vector<FileStatus> statuses = new Vector<FileStatus>();
    final Pattern httpEntryPattern = Pattern
        .compile("<a href=\"[^\"]+\">(.+)</a>\\s*(\\d+-\\w+-\\d+)\\s+(\\d+:\\d+)\\s+([\\d\\.]+[KMG]|-)");
    // Qualify against this FileSystem so the entry paths built below are fully qualified
    f = f.makeQualified(this);
    URL url = f.toUri().toURL();

    int retryCount = HTTPFileSystem.retries;
    BufferedReader inBuffer = null;
    try {
        while (inBuffer == null && retryCount-- > 0) {
            try {
                inBuffer = new BufferedReader(new InputStreamReader(url.openStream()));
            } catch (java.net.SocketException e) {
                if (retryCount == 0)
                    throw e;
                LOG.info("Error accessing file '" + url + "'. Trials left: " + retryCount);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e1) {
                }
            } catch (java.net.UnknownHostException e) {
                if (retryCount == 0)
                    throw e;
                LOG.info("Error accessing file '" + url + "'. Trials left: " + retryCount);
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e1) {
                }
            }
        }
        if (inBuffer == null)
            throw new RuntimeException("Could not access URL " + f);

        String line;
        while ((line = inBuffer.readLine()) != null) {
            Matcher matcher = httpEntryPattern.matcher(line);
            while (matcher.find()) {
                String entryName = matcher.group(1);
                Path entryPath = new Path(f, entryName);

                String entryDate = matcher.group(2);
                String entryTime = matcher.group(3);
                long modificationTime = parseDateTime(entryDate, entryTime);

                String size = matcher.group(4);
                boolean isDir = size.equals("-");
                long length = isDir ? 0 : parseSize(size);

                FileStatus fstatus = new FileStatus(length, isDir, 1, 4096, modificationTime,
                    modificationTime, null, null, null, entryPath);
                statuses.add(fstatus);
            }
        }
    } finally {
        if (inBuffer != null)
            inBuffer.close();
    }
    return statuses.toArray(new FileStatus[statuses.size()]);
}
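Inside a FileSystem implementation, passing this to makeQualified resolves the caller's path against the filesystem's own URI before child paths are derived from it, so every Path handed back carries the right scheme and authority. A minimal sketch of that pattern (the class and scheme are hypothetical):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Hypothetical FileSystem fragment illustrating makeQualified(this).
// Declared abstract so the remaining FileSystem methods need not be shown here.
public abstract class ReadOnlyListingFileSystem extends FileSystem {
    @Override
    @SuppressWarnings("deprecation")
    public FileStatus[] listStatus(Path f) throws IOException {
        f = f.makeQualified(this);         // e.g. "dir/sub" -> "myfs://host/dir/sub"
        Path child = new Path(f, "entry"); // inherits scheme and authority from f
        FileStatus status = new FileStatus(0L, true, 1, 4096, 0L, child);
        return new FileStatus[] { status };
    }
}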