Example usage for org.apache.hadoop.fs FileSystem getDefaultBlockSize

Introduction

On this page you can find usage examples for org.apache.hadoop.fs.FileSystem.getDefaultBlockSize(Path).

Prototype

public long getDefaultBlockSize(Path f) 

Document

Return the number of bytes that large input files should optimally be split into to minimize I/O time.
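
As a quick illustration, here is a minimal, self-contained sketch of a direct call. The class name, configuration, and path are illustrative assumptions, not taken from the examples below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockSizeProbe {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/data.txt"); // hypothetical path
        FileSystem fs = path.getFileSystem(conf);
        // Per-path variant: the default block size can differ between file
        // systems (e.g. HDFS vs. the local file system), so it is resolved
        // against the file system that owns this path.
        long blockSize = fs.getDefaultBlockSize(path);
        System.out.println("Default block size in bytes: " + blockSize);
    }
}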

Usage

From source file:com.thinkbiganalytics.nifi.v2.hdfs.AbstractHadoopProcessor.java

License:Apache License

/**
 * Reset Hadoop Configuration and FileSystem based on the supplied configuration resources.
 *
 * @param configResources for configuration
 * @param dir             the target directory
 * @param context         for context, which gives access to the principal
 * @return An HdfsResources object
 * @throws IOException if unable to access HDFS
 */
HdfsResources resetHDFSResources(String configResources, String dir, ProcessContext context)
        throws IOException {
    // org.apache.hadoop.conf.Configuration saves its current thread context class loader to use for threads that it creates
    // later to do I/O. We need this class loader to be the NarClassLoader instead of the magical
    // NarThreadContextClassLoader.
    ClassLoader savedClassLoader = Thread.currentThread().getContextClassLoader();
    Thread.currentThread().setContextClassLoader(this.getClass().getClassLoader());

    try {
        Configuration config = getConfigurationFromResources(configResources);

        // first check for timeout on HDFS connection, because FileSystem has a hard coded 15 minute timeout
        checkHdfsUriForTimeout(config);

        // disable caching of Configuration and FileSystem objects, else we cannot reconfigure the processor without a complete
        // restart
        String disableCacheName = String.format("fs.%s.impl.disable.cache",
                FileSystem.getDefaultUri(config).getScheme());
        config.set(disableCacheName, "true");

        // If Kerberos is enabled, create the file system as the Kerberos principal
        // -- use RESOURCES_LOCK to guarantee UserGroupInformation is accessed by only a single thread at a time
        FileSystem fs = null;
        UserGroupInformation ugi = null;
        synchronized (RESOURCES_LOCK) {
            if (config.get("hadoop.security.authentication").equalsIgnoreCase("kerberos")) {
                String principal = context.getProperty(kerberosPrincipal).getValue();
                String keyTab = context.getProperty(kerberosKeytab).getValue();
                UserGroupInformation.setConfiguration(config);
                ugi = UserGroupInformation.loginUserFromKeytabAndReturnUGI(principal, keyTab);
                modifyConfig(context, config);
                fs = getFileSystemAsUser(config, ugi);
                lastKerberosReloginTime = System.currentTimeMillis() / 1000;
            } else {
                config.set("ipc.client.fallback-to-simple-auth-allowed", "true");
                config.set("hadoop.security.authentication", "simple");
                modifyConfig(context, config);
                fs = getFileSystem(config);
            }
        }
        getLog().info(
                "Initialized a new HDFS File System with working dir: {} default block size: {} default replication: {} config: {}",
                new Object[] { fs.getWorkingDirectory(), fs.getDefaultBlockSize(new Path(dir)),
                        fs.getDefaultReplication(new Path(dir)), config.toString() });
        return new HdfsResources(config, fs, ugi);
    } finally {
        Thread.currentThread().setContextClassLoader(savedClassLoader);
    }
}

From source file:edu.umn.cs.spatialHadoop.indexing.Indexer.java

License:Open Source License

/**
 * Create a partitioner for a particular job.
 * @param ins the input paths
 * @param out the output path, used to determine the output block size
 * @param job the job configuration
 * @param partitionerName the short name of the partitioner or its fully qualified class name
 * @return the created partitioner, or null if it could not be instantiated
 * @throws IOException if the input cannot be read while sampling
 */
public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
        throws IOException {
    try {
        Partitioner partitioner;
        Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(partitionerName.toLowerCase());
        if (partitionerClass == null) {
            // Try to parse the name as a class name
            try {
                partitionerClass = Class.forName(partitionerName).asSubclass(Partitioner.class);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Unknown index type '" + partitionerName + "'");
            }
        }

        if (PartitionerReplicate.containsKey(partitionerName.toLowerCase())) {
            boolean replicate = PartitionerReplicate.get(partitionerName.toLowerCase());
            job.setBoolean("replicate", replicate);
        }
        partitioner = partitionerClass.newInstance();

        long t1 = System.currentTimeMillis();
        final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");
        // Determine number of partitions
        long inSize = 0;
        for (Path in : ins) {
            inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
        }
        long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
        FileSystem outFS = out.getFileSystem(job);
        long outBlockSize = outFS.getDefaultBlockSize(out);

        final List<Point> sample = new ArrayList<Point>();
        float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
        long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

        LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
        ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
            @Override
            public void collect(Point p) {
                sample.add(p.clone());
            }
        };

        OperationsParams params2 = new OperationsParams(job);
        params2.setFloat("ratio", sample_ratio);
        params2.setLong("size", sample_size);
        if (job.get("shape") != null)
            params2.set("shape", job.get("shape"));
        if (job.get("local") != null)
            params2.set("local", job.get("local"));
        params2.setClass("outshape", Point.class, Shape.class);
        Sampler.sample(ins, resultCollector, params2);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time for sampling in millis: " + (t2 - t1));
        LOG.info("Finished reading a sample of " + sample.size() + " records");

        int partitionCapacity = (int) Math.max(1,
                Math.floor((double) sample.size() * outBlockSize / estimatedOutSize));
        int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
        LOG.info("Partitioning the space into " + numPartitions + " partitions with capacity of "
                + partitionCapacity);

        partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), partitionCapacity);

        return partitioner;
    } catch (InstantiationException e) {
        e.printStackTrace();
        return null;
    } catch (IllegalAccessException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.Contains.java

License:Open Source License

public static <S extends Shape> long contains(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Contains.class);

    LOG.info("Contains journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(ContainsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(ContainsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Crosses.java

License:Open Source License

public static <S extends Shape> long crosses(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Crosses.class);

    LOG.info("Crosses journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Crosses");
    job.setMapperClass(CrossesMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(CrossesReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Disjoint.java

License:Open Source License

public static <S extends Shape> long disjoint(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Disjoint.class);

    LOG.info("Touches journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Disjoint");
    job.setMapperClass(DisjointMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(DisjointReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Equals.java

License:Open Source License

public static <S extends Shape> long equals(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Equals.class);

    LOG.info("Equals journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Equals");
    job.setMapperClass(EqualsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(EqualsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Indexer.java

License:Open Source License

/**
 * Create a partitioner for a particular job.
 * @param ins the input paths
 * @param out the output path, used to determine the output block size
 * @param job the job configuration
 * @param partitionerName the short name of the partitioner or its fully qualified class name
 * @return the created partitioner, or null if it could not be instantiated
 * @throws IOException if the input cannot be read while sampling
 */
public static Partitioner createPartitioner(Path[] ins, Path out, Configuration job, String partitionerName)
        throws IOException {
    try {
        Partitioner partitioner = null;
        Class<? extends Partitioner> partitionerClass = PartitionerClasses.get(partitionerName.toLowerCase());
        if (partitionerClass == null) {
            // Try to parse the name as a class name
            try {
                partitionerClass = Class.forName(partitionerName).asSubclass(Partitioner.class);
            } catch (ClassNotFoundException e) {
                throw new RuntimeException("Unknown index type '" + partitionerName + "'");
            }
        }

        if (PartitionerReplicate.containsKey(partitionerName.toLowerCase())) {
            boolean replicate = PartitionerReplicate.get(partitionerName.toLowerCase());
            job.setBoolean("replicate", replicate);
        }
        partitioner = partitionerClass.newInstance();

        long t1 = System.currentTimeMillis();
        final Rectangle inMBR = (Rectangle) OperationsParams.getShape(job, "mbr");
        // Determine number of partitions
        long inSize = 0;
        for (Path in : ins) {
            inSize += FileUtil.getPathSize(in.getFileSystem(job), in);
        }
        long estimatedOutSize = (long) (inSize * (1.0 + job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f)));
        FileSystem outFS = out.getFileSystem(job);
        long outBlockSize = outFS.getDefaultBlockSize(out);
        int numPartitions = Math.max(1, (int) Math.ceil((float) estimatedOutSize / outBlockSize));
        LOG.info("Partitioning the space into " + numPartitions + " partitions");

        final Vector<Point> sample = new Vector<Point>();
        float sample_ratio = job.getFloat(SpatialSite.SAMPLE_RATIO, 0.01f);
        long sample_size = job.getLong(SpatialSite.SAMPLE_SIZE, 100 * 1024 * 1024);

        LOG.info("Reading a sample of " + (int) Math.round(sample_ratio * 100) + "%");
        ResultCollector<Point> resultCollector = new ResultCollector<Point>() {
            @Override
            public void collect(Point p) {
                sample.add(p.clone());
            }
        };
        OperationsParams params2 = new OperationsParams(job);
        params2.setFloat("ratio", sample_ratio);
        params2.setLong("size", sample_size);
        params2.setClass("outshape", Point.class, Shape.class);
        Sampler.sample(ins, resultCollector, params2);
        long t2 = System.currentTimeMillis();
        System.out.println("Total time for sampling in millis: " + (t2 - t1));
        LOG.info("Finished reading a sample of " + sample.size() + " records");

        partitioner.createFromPoints(inMBR, sample.toArray(new Point[sample.size()]), numPartitions);

        return partitioner;
    } catch (InstantiationException e) {
        e.printStackTrace();
        return null;
    } catch (IllegalAccessException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.Intersects.java

License:Open Source License

public static <S extends Shape> long intersects(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Intersects.class);

    LOG.info("Intersects journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Intersects");
    job.setMapperClass(IntersectsMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(IntersectsReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Overlaps.java

License:Open Source License

public static <S extends Shape> long overlaps(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Overlaps.class);

    LOG.info("Overlaps journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Overlaps");
    job.setMapperClass(OverlapMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(OverlapReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Repartition.java

License:Open Source License

/**
 * Calculates number of partitions required to index the given file.
 * @param conf The current configuration which can contain user-defined parameters
 * @param inFileSize The size of the input file in bytes
 * @param outFs The output file system where the index will be written
 * @param outFile The path of the output file which is used to get the output block size.
 * @param blockSize If set, this will override the default output block size.
 * @return The number of blocks needed to write the index file
 */
public static int calculateNumberOfPartitions(Configuration conf, long inFileSize, FileSystem outFs,
        Path outFile, long blockSize) {
    final float IndexingOverhead = conf.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.1f);
    long indexedFileSize = (long) (inFileSize * (1 + IndexingOverhead));
    if (blockSize == 0)
        blockSize = outFs.getDefaultBlockSize(outFile);
    return (int) Math.ceil((float) indexedFileSize / blockSize);
}