Example usage for org.apache.hadoop.mapred LineRecordReader createKey

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred LineRecordReader createKey.

Prototype

public LongWritable createKey()

Source Link

Usage

From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java

License:Open Source License

protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null) {
        FileStatus[] listStatus;//w w  w .  j  a  va  2s.co m
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else if (status.getPath().getName().toLowerCase().endsWith(".list")) {
                LineRecordReader in = new LineRecordReader(fs.open(status.getPath()), 0, status.getLen(),
                        Integer.MAX_VALUE);
                LongWritable key = in.createKey();
                Text value = in.createValue();
                while (in.next(key, value)) {
                    result.add(fs.getFileStatus(new Path(status.getPath().getParent(), value.toString())));
                }
                in.close();
            } else {
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(indexDir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java

License:Open Source License

/**
 * Read all categories from the category file
 * @param categoryFile/*from  w ww. j ava 2s .  co  m*/
 * @param categoryShapes
 * @param idToCategory
 * @throws IOException
 */
private static void readCategories(Path categoryFile, Map<Integer, Integer> idToCategory) throws IOException {
    Map<Integer, String> idToCatName = new HashMap<Integer, String>();
    FileSystem fsCategory = FileSystem.getLocal(new Configuration());
    long categoryFileSize = fsCategory.getFileStatus(categoryFile).getLen();
    if (categoryFileSize > 1024 * 1024)
        LOG.warn("Category file size is big: " + categoryFileSize);
    InputStream inCategory = fsCategory.open(categoryFile);
    LineRecordReader lineReader = new LineRecordReader(inCategory, 0, categoryFileSize, new Configuration());
    LongWritable lineOffset = lineReader.createKey();
    Text line = lineReader.createValue();

    Set<String> catNames = new TreeSet<String>();

    while (lineReader.next(lineOffset, line)) {
        int shape_id = TextSerializerHelper.consumeInt(line, ',');
        String cat_name = line.toString();
        catNames.add(cat_name);
        idToCatName.put(shape_id, cat_name);
    }

    lineReader.close();

    // Change category names to numbers
    Map<String, Integer> cat_name_to_id = new HashMap<String, Integer>();
    int cat_id = 0;
    for (String cat_name : catNames) {
        cat_name_to_id.put(cat_name, cat_id++);
    }

    for (Map.Entry<Integer, String> entry : idToCatName.entrySet()) {
        idToCategory.put(entry.getKey(), cat_name_to_id.get(entry.getValue()));
    }
}

From source file:mlbench.pagerank.PagerankMerge.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException, InterruptedException {
    try {/*from w w w. j  a v  a 2  s  . co  m*/
        parseArgs(args);
        HashMap<String, String> conf = new HashMap<String, String>();
        initConf(conf);
        MPI_D.Init(args, MPI_D.Mode.Common, conf);

        JobConf jobConf = new JobConf(confPath);
        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
            if (rank == 0) {
                LOG.info(PagerankMerge.class.getSimpleName() + " O start.");
            }
            FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, jobConf,
                    inDir, rank);
            for (int i = 0; i < inputs.length; i++) {
                FileSplit fsplit = inputs[i];
                LineRecordReader kvrr = new LineRecordReader(jobConf, fsplit);

                LongWritable key = kvrr.createKey();
                Text value = kvrr.createValue();
                {
                    while (kvrr.next(key, value)) {
                        String line_text = value.toString();
                        final String[] line = line_text.split("\t");
                        if (line.length >= 2) {
                            MPI_D.Send(new IntWritable(Integer.parseInt(line[0])), new Text(line[1]));
                        }
                    }
                }
            }

        } else if (MPI_D.COMM_BIPARTITE_A != null) {
            // A communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
            if (rank == 0) {
                LOG.info(PagerankMerge.class.getSimpleName() + " A start.");
            }
            HadoopWriter<IntWritable, Text> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir,
                    IntWritable.class, Text.class, TextOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);

            IntWritable oldKey = null;
            double next_rank = 0;
            double previous_rank = 0;
            double diff = 0;
            int local_diffs = 0;
            random_coeff = (1 - mixing_c) / (double) number_nodes;
            converge_threshold = ((double) 1.0 / (double) number_nodes) / 10;
            Object[] keyValue = MPI_D.Recv();
            while (keyValue != null) {
                IntWritable key = (IntWritable) keyValue[0];
                Text value = (Text) keyValue[1];
                if (oldKey == null) {
                    oldKey = key;
                }
                if (!key.equals(oldKey)) {
                    next_rank = next_rank * mixing_c + random_coeff;
                    outrw.write(oldKey, new Text("v" + next_rank));
                    diff = Math.abs(previous_rank - next_rank);
                    if (diff > converge_threshold) {
                        local_diffs += 1;
                    }
                    oldKey = key;
                    next_rank = 0;
                    previous_rank = 0;
                }

                String cur_value_str = value.toString();
                if (cur_value_str.charAt(0) == 's') {
                    previous_rank = Double.parseDouble(cur_value_str.substring(1));
                } else {
                    next_rank += Double.parseDouble(cur_value_str.substring(1));
                }

                keyValue = MPI_D.Recv();
            }
            if (previous_rank != 0) {
                next_rank = next_rank * mixing_c + random_coeff;
                outrw.write(oldKey, new Text("v" + next_rank));
                diff = Math.abs(previous_rank - next_rank);
                if (diff > converge_threshold)
                    local_diffs += 1;
            }
            outrw.close();
            reduceDiffs(local_diffs, rank);
        }

        MPI_D.Finalize();
    } catch (MPI_D_Exception e) {
        e.printStackTrace();
    }
}

From source file:mlbench.pagerank.PagerankNaive.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException, InterruptedException {
    try {/*from   ww w .  j  ava  2s .  c  o  m*/
        parseArgs(args);
        HashMap<String, String> conf = new HashMap<String, String>();
        initConf(conf);
        MPI_D.Init(args, MPI_D.Mode.Common, conf);

        JobConf jobConf = new JobConf(confPath);
        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
            if (rank == 0) {
                LOG.info(PagerankNaive.class.getSimpleName() + " O start.");
            }
            FileSplit[] inputs1 = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                    jobConf, edgeDir, rank);
            FileSplit[] inputs2 = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                    jobConf, vecDir, rank);
            FileSplit[] inputs = (FileSplit[]) ArrayUtils.addAll(inputs2, inputs1);
            for (int i = 0; i < inputs.length; i++) {
                FileSplit fsplit = inputs[i];
                LineRecordReader kvrr = new LineRecordReader(jobConf, fsplit);

                LongWritable key = kvrr.createKey();
                Text value = kvrr.createValue();
                {
                    IntWritable k = new IntWritable();
                    Text v = new Text();
                    while (kvrr.next(key, value)) {
                        String line_text = value.toString();
                        // ignore comments in edge file
                        if (line_text.startsWith("#"))
                            continue;

                        final String[] line = line_text.split("\t");
                        if (line.length < 2)
                            continue;

                        // vector : ROWID VALUE('vNNNN')
                        if (line[1].charAt(0) == 'v') {
                            k.set(Integer.parseInt(line[0]));
                            v.set(line[1]);
                            MPI_D.Send(k, v);
                        } else {
                            /*
                             * In other matrix-vector multiplication, we
                            * output (dst, src) here However, In PageRank,
                            * the matrix-vector computation formula is M^T
                            * * v. Therefore, we output (src,dst) here.
                            */
                            int src_id = Integer.parseInt(line[0]);
                            int dst_id = Integer.parseInt(line[1]);
                            k.set(src_id);
                            v.set(line[1]);
                            MPI_D.Send(k, v);

                            if (make_symmetric == 1) {
                                k.set(dst_id);
                                v.set(line[0]);
                                MPI_D.Send(k, v);
                            }
                        }
                    }
                }
            }

        } else if (MPI_D.COMM_BIPARTITE_A != null) {
            // A communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
            if (rank == 0) {
                LOG.info(PagerankNaive.class.getSimpleName() + " A start.");
            }

            HadoopWriter<IntWritable, Text> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir,
                    IntWritable.class, Text.class, TextOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);

            IntWritable oldKey = null;
            int i;
            double cur_rank = 0;
            ArrayList<Integer> dst_nodes_list = new ArrayList<Integer>();
            Object[] keyValue = MPI_D.Recv();
            while (keyValue != null) {
                IntWritable key = (IntWritable) keyValue[0];
                Text value = (Text) keyValue[1];
                if (oldKey == null) {
                    oldKey = key;
                }
                // A new key arrives
                if (!key.equals(oldKey)) {
                    outrw.write(oldKey, new Text("s" + cur_rank));
                    int outdeg = dst_nodes_list.size();
                    if (outdeg > 0) {
                        cur_rank = cur_rank / (double) outdeg;
                    }
                    for (i = 0; i < outdeg; i++) {
                        outrw.write(new IntWritable(dst_nodes_list.get(i)), new Text("v" + cur_rank));
                    }
                    oldKey = key;
                    cur_rank = 0;
                    dst_nodes_list = new ArrayList<Integer>();
                }
                // common record
                String line_text = value.toString();
                final String[] line = line_text.split("\t");
                if (line.length == 1) {
                    if (line_text.charAt(0) == 'v') { // vector : VALUE
                        cur_rank = Double.parseDouble(line_text.substring(1));
                    } else { // edge : ROWID
                        dst_nodes_list.add(Integer.parseInt(line[0]));
                    }
                }
                keyValue = MPI_D.Recv();
            }
            // write the left part
            if (cur_rank != 0) {
                outrw.write(oldKey, new Text("s" + cur_rank));
                int outdeg = dst_nodes_list.size();
                if (outdeg > 0) {
                    cur_rank = cur_rank / (double) outdeg;
                }
                for (i = 0; i < outdeg; i++) {
                    outrw.write(new IntWritable(dst_nodes_list.get(i)), new Text("v" + cur_rank));
                }
            }
            outrw.close();
        }
        MPI_D.Finalize();
    } catch (MPI_D_Exception e) {
        e.printStackTrace();
    }
}