Example usage for org.apache.hadoop.fs FileSystem open

List of usage examples for org.apache.hadoop.fs FileSystem open

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem open.

Prototype

public FSDataInputStream open(Path f) throws IOException 

Document

Opens an FSDataInputStream at the indicated Path. FileSystem also provides an open(PathHandle fd) overload that opens a stream matching a PathHandle instance; the examples below all use the Path overload.
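
Before the longer examples below, here is a minimal, self-contained sketch of the typical open-and-read pattern with the Path overload. The HDFS URI and file path are placeholders chosen for illustration, not values taken from the examples on this page.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder URI and path; adjust to your cluster and file.
        FileSystem fs = FileSystem.get(URI.create("hdfs://localhost:9000"), conf);
        Path path = new Path("/tmp/example.txt");

        // try-with-resources closes the stream even if reading fails
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}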

Usage

From source file:com.cg.mapreduce.fpgrowth.mahout.fpm.FPGrowthDriver.java

License:Apache License

private static void runFPGrowth(Parameters params) throws IOException {
    log.info("Starting Sequential FPGrowth");
    int maxHeapSize = Integer.valueOf(params.get("maxHeapSize", "50"));
    int minSupport = Integer.valueOf(params.get("minSupport", "3"));

    Path output = new Path(params.get("output", "output.txt"));
    Path input = new Path(params.get("input"));

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);

    Charset encoding = Charset.forName(params.get("encoding"));

    String pattern = params.get("splitPattern", PFPGrowth.SPLITTER.toString());

    SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, output, Text.class,
            TopKStringPatterns.class);

    FSDataInputStream inputStream = null;
    FSDataInputStream inputStreamAgain = null;

    Collection<String> features = Sets.newHashSet();

    if ("true".equals(params.get(PFPGrowth.USE_FPG2))) {
        com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String> fp = new com.cg.mapreduce.fpgrowth.mahout.fpm.fpgrowth2.FPGrowthObj<String>();

        try {
            inputStream = fs.open(input);
            inputStreamAgain = fs.open(input);
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(
                            new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    } else {
        FPGrowth<String> fp = new FPGrowth<String>();

        try {
            inputStream = fs.open(input);
            inputStreamAgain = fs.open(input);
            fp.generateTopKFrequentPatterns(
                    new StringRecordIterator(new FileLineIterable(inputStream, encoding, false), pattern),
                    fp.generateFList(new StringRecordIterator(
                            new FileLineIterable(inputStreamAgain, encoding, false), pattern), minSupport),
                    minSupport, maxHeapSize, features,
                    new StringOutputConverter(
                            new SequenceFileOutputCollector<Text, TopKStringPatterns>(writer)),
                    new ContextStatusUpdater(null));
        } finally {
            Closeables.close(writer, false);
            Closeables.close(inputStream, true);
            Closeables.close(inputStreamAgain, true);
        }
    }

    List<Pair<String, TopKStringPatterns>> frequentPatterns = FPGrowth.readFrequentPattern(conf, output);
    for (Pair<String, TopKStringPatterns> entry : frequentPatterns) {
        log.info("Dumping Patterns for Feature: {} \n{}", entry.getFirst(), entry.getSecond());
    }
}

From source file:com.chinamobile.bcbsp.bspstaff.BSPStaff.java

License:Apache License

private void readMigratePartition(StaffSSControllerInterface sssc, int currentSuperStepCounter)
        throws IOException {
    BufferedReader br = null;
    Path migratePartitionPath = new Path(migratePartitionDir);
    FileSystem fsFileSystem = FileSystem.get(this.getConf().getConf());
    FileStatus[] fs = fsFileSystem.listStatus(migratePartitionPath);
    Path[] listPath = FileUtil.stat2Paths(fs);
    for (Path p : listPath) {
        FSDataInputStream fsInput = fsFileSystem.open(p);
        br = new BufferedReader(new InputStreamReader(fsInput));
        try {
            String line = null;
            while (null != (line = br.readLine())) {
                String[] strs = line.split(":");
                this.partitioner.updateMigratePartition(new Text(strs[0]), Integer.parseInt(strs[1]));
            }
        } finally {
            // close the reader (and the underlying stream) before moving to the next file
            br.close();
        }
    }
}

From source file:com.chinnu.churndetection.fuzzykmeans.FuzzyKMeansReducer.java

@Override
protected void reduce(IntWritable key, Iterable<Vector> values,
        Reducer<IntWritable, Vector, IntWritable, Text>.Context context)
        throws IOException, InterruptedException {

    double[] sum = new double[DATALENGTH];
    for (int i = 0; i < DATALENGTH; i++) {
        sum[i] = 0;
    }

    int count = 0;
    for (Vector vector : values) {

        for (int i = 0; i < DATALENGTH; i++) {
            sum[i] += vector.getData()[i];
        }
        count++;

        Text text = new Text(vector.toString());
        context.write(key, text);
    }

    double[] newCenter = new double[DATALENGTH];
    for (int i = 0; i < DATALENGTH; i++) {
        newCenter[i] = sum[i] / count;
    }

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    List<double[]> curr_center = new ArrayList<>();

    String[] lineSplit = CURR_CENTER.split("\n");
    for (int j = 0; j < lineSplit.length; j++) {
        String line = lineSplit[j];
        String[] split = line.split(",");
        double[] temp = new double[split.length];
        for (int i = 0; i < split.length; i++) {
            temp[i] = Double.parseDouble(split[i]);
        }
        curr_center.add(temp);
    }

    List<String> appendLine = new ArrayList<>();
    if (fs.exists(new Path(NEW_CENTER))) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(NEW_CENTER))));

        String line;
        while ((line = br.readLine()) != null) {
            appendLine.add(line);
        }
        // close the reader so the file can be overwritten below
        br.close();
    }

    PrintWriter pw = new PrintWriter(new OutputStreamWriter(fs.create(new Path(NEW_CENTER), true)));
    for (String string : appendLine) {
        pw.println(string);
        pw.flush();
    }

    String line = "";
    for (int i = 0; i < DATALENGTH; i++) {
        line += newCenter[i] + ",";
    }
    String substring = line.substring(0, line.length() - 1);

    pw.println(substring);
    pw.flush();
    pw.close();

    MRLogger.Log(context.getJobName());
    MRLogger.Log(Arrays.toString(curr_center.get(key.get())));
    MRLogger.Log(Arrays.toString(newCenter));

    double curr_Distance = DistanceComparator.findDistance(curr_center.get(key.get()), newCenter);
    MRLogger.Log(curr_Distance + "");

    if (curr_Distance < 0.01) {
        PrintWriter pw1 = new PrintWriter(
                new OutputStreamWriter(fs.create(new Path(ChurnDriver.CENTER_CONVERGED), true)));
        pw1.println("converged");
        pw1.flush();
        pw1.close();
    }

}

From source file:com.cip.crane.agent.utils.TaskHelper.java

License:Open Source License

@SuppressWarnings("unused")
private void readFileFromHdfs(String srcFile, String destFile) throws IOException, FileNotFoundException {
    File file = new File(destFile);
    if (file.exists()) {
        file.delete();
    }
    byte[] buf = new byte[BUFFER_SIZE];
    FileOutputStream fos = new FileOutputStream(file);
    FileSystem fs;
    FSDataInputStream hdfsInput;
    try {
        fs = FileSystem.get(URI.create(srcFile), conf);
        hdfsInput = fs.open(new Path(srcFile));
        int num = hdfsInput.read(buf);
        while (num != (-1)) { // keep reading until EOF
            fos.write(buf, 0, num); // write the bytes just read
            fos.flush();
            num = hdfsInput.read(buf); // read the next chunk
        }
        hdfsInput.close();
        fos.close();
        fs.close();
    } catch (IOException e) {
        if (file.exists()) {
            file.delete();
        }
        throw e;
    }
}
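
The copy above can also be written with Hadoop's org.apache.hadoop.io.IOUtils.copyBytes helper together with try-with-resources, which closes the streams even when the copy fails. A minimal sketch; the class name and paths are placeholders, not part of the source file above.

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;

public class HdfsCopySketch {

    // Copy an HDFS file to the local filesystem.
    // srcFile is an HDFS URI (for example hdfs://namenode:9000/data/in.txt),
    // destFile is a local path; both are hypothetical.
    public static void copyToLocal(Configuration conf, String srcFile, String destFile) throws IOException {
        FileSystem fs = FileSystem.get(URI.create(srcFile), conf);
        try (FSDataInputStream in = fs.open(new Path(srcFile));
                OutputStream out = new FileOutputStream(destFile)) {
            // 4096-byte buffer; 'false' because try-with-resources already closes both streams
            IOUtils.copyBytes(in, out, 4096, false);
        }
    }
}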

From source file:com.ckelsel.hadoop.dfs.Test.Test.java

License:Open Source License

public static void main(String[] args) throws Exception {
    String uri = "hdfs://localhost:9000/";
    Configuration config = new Configuration();
    FileSystem fs = FileSystem.get(URI.create(uri), config);

    // list the contents of hdfs://localhost:9000/user/ckelsel
    FileStatus[] statuses = fs.listStatus(new Path("/user/ckelsel"));
    for (FileStatus status : statuses) {
        System.out.println(status);
    }

    // create /user/ckelsel/test.log and write to it
    FSDataOutputStream os = fs.create(new Path("/user/ckelsel/test.log"));
    os.write("Hello World!".getBytes());
    os.flush();
    os.close();

    // open /user/ckelsel/test.log and copy its contents to stdout
    InputStream is = fs.open(new Path("/user/ckelsel/test.log"));
    IOUtils.copyBytes(is, System.out, 1024, true);
}

From source file:com.cloudera.bigdata.analysis.dataload.mapreduce.SplitableRecordReader.java

License:Apache License

/**
 * Decide the start of the reader.
 */
public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // if (codec instanceof CryptoCodec && job instanceof JobConf)
    // CryptoContextHelper.resetInputCryptoContext((CryptoCodec) codec,
    // (JobConf) job, file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(cIn, job);
            } else {
                in = new LineReader(cIn, job, this.recordDelimiterBytes);
            }
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            if (null == this.recordDelimiterBytes) {
                in = new LineReader(codec.createInputStream(fileIn), job);
            } else {
                in = new LineReader(codec.createInputStream(fileIn), job, this.recordDelimiterBytes);
            }
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }
        filePosition = fileIn;
    }
    LOG.info("Read from " + split.getPath().toString());
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));

        // Read another line as previous.

        Text current = new Text();

        int newSize = in.readLine(previous, maxLineLength, maxBytesToConsume(start));

        LOG.info("Skip line " + previous + " for last split.");

        start += newSize;

        // Keep reading until a splitable point is found.
        while (start <= end) {
            newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
            if (canSplit(previous.getBytes(), current.getBytes())) {
                break;
            }
            start += newSize;
            previous.set(current.getBytes());
            LOG.info("Skip line " + previous + " for last split.");
        }

        // If exceed the end, still read one extra line.
        if (start > end) {
            if (isContinue) {
                newSize = in.readLine(current, maxLineLength, maxBytesToConsume(start));
                if (!canSplit(previous.getBytes(), current.getBytes())) {
                    // Still not splitable. So skip the block.
                    start += newSize;
                    isContinue = false;
                }
            }
        }
        LOG.info("Split between: \n" + previous + "\n" + current);

        // Restart at the last read line.
        fileIn.seek(start);
        if (null == this.recordDelimiterBytes) {
            in = new LineReader(fileIn, job);
        } else {
            in = new LineReader(fileIn, job, this.recordDelimiterBytes);
        }

        this.pos = start;
    } else {
        Text skip = new Text();
        start += in.readLine(skip, maxLineLength, maxBytesToConsume(start));
        // start += in.readLine(skip, 0, maxBytesToConsume(start));
        LOG.info("Skip line " + skip + ". Start at " + start);
    }

    // Restart at the start index.
}

From source file:com.cloudera.ByteBufferRecordReader.java

License:Apache License

private void initialize(Configuration job, long splitStart, long splitLength, Path file) throws IOException {
    start = splitStart;
    end = start + splitLength;
    pos = start;

    // open the file and seek to the start of the split
    final FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(file);

    this.readStats = new ReadStatistics();
    this.bufferPool = new ElasticByteBufferPool();
    boolean skipChecksums = job.getBoolean("bytecount.skipChecksums", false);
    this.readOption = skipChecksums ? EnumSet.of(ReadOption.SKIP_CHECKSUMS) : EnumSet.noneOf(ReadOption.class);

    CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        CompressionInputStream cIn = codec.createInputStream(fileIn, decompressor);
        filePosition = cIn;
        inputStream = cIn;
        LOG.info("Compressed input; cannot compute number of records in the split");
    } else {
        fileIn.seek(start);
        filePosition = fileIn;
        inputStream = fileIn;
        LOG.info("Split pos = " + start + " length " + splitLength);
    }
}

From source file:com.cloudera.cdk.morphline.hadoop.rcfile.ReadRCFileTest.java

License:Apache License

private InputStream readPath(final Path inputFile) throws IOException {
    FileSystem fs = inputFile.getFileSystem(new Configuration());
    return fs.open(inputFile);
}

From source file:com.cloudera.cdk.tools.JobClasspathHelper.java

License:Apache License

/**
 *
 * @param conf
 *            Configuration object for the Job. Used to get the FileSystem associated with it.
 * @param libDir
 *            Destination directory in the FileSystem (Usually HDFS) where to upload and look for the libs.
 * @param classesToInclude
 *            Classes that are needed by the job. JarFinder will look for the jar containing these classes.
 * @throws Exception
 */
public void prepareClasspath(final Configuration conf, final Path libDir, Class<?>... classesToInclude)
        throws Exception {
    FileSystem fs = null;
    List<Class<?>> classList = new ArrayList<Class<?>>(Arrays.asList(classesToInclude));
    fs = FileSystem.get(conf);
    Map<String, String> jarMd5Map = new TreeMap<String, String>();
    // for each classes we use JarFinder to locate the jar in the local classpath.
    for (Class<?> clz : classList) {
        if (clz != null) {
            String localJarPath = JarFinder.getJar(clz);
            // we don't want to upload the same jar twice
            if (!jarMd5Map.containsKey(localJarPath)) {
                // We should not push core Hadoop classes with this tool.
                // Should this be the developer's responsibility, or should we
                // keep this safeguard here?
                if (!clz.getName().startsWith("org.apache.hadoop.")) {
                    // we compute the MD5 sum of the local jar
                    InputStream in = new FileInputStream(localJarPath);
                    boolean threw = true;
                    try {
                        String md5sum = DigestUtils.md5Hex(in);
                        jarMd5Map.put(localJarPath, md5sum);
                        threw = false;
                    } finally {
                        Closeables.close(in, threw);
                    }
                } else {
                    logger.info("Ignoring {}, since it looks like it's from Hadoop's core libs", localJarPath);
                }
            }
        }
    }

    for (Entry<String, String> entry : jarMd5Map.entrySet()) {
        Path localJarPath = new Path(entry.getKey());
        String jarFilename = localJarPath.getName();
        String localMd5sum = entry.getValue();
        logger.info("Jar {}. MD5 : [{}]", localJarPath, localMd5sum);

        Path remoteJarPath = new Path(libDir, jarFilename);
        Path remoteMd5Path = new Path(libDir, jarFilename + ".md5");

        // If the jar file does not exist in HDFS or if the MD5 file does not exist in HDFS,
        // we force the upload of the jar.
        if (!fs.exists(remoteJarPath) || !fs.exists(remoteMd5Path)) {
            copyJarToHDFS(fs, localJarPath, localMd5sum, remoteJarPath, remoteMd5Path);
        } else {
            // If the jar exists, we validate the MD5 file.
            // If the MD5 sums are different, we upload the jar.
            FSDataInputStream md5FileStream = null;

            String remoteMd5sum = "";
            try {
                md5FileStream = fs.open(remoteMd5Path);
                byte[] md5bytes = new byte[32];
                if (32 == md5FileStream.read(md5bytes)) {
                    remoteMd5sum = new String(md5bytes, Charsets.UTF_8);
                }
            } finally {
                if (md5FileStream != null) {
                    md5FileStream.close();
                }
            }

            if (localMd5sum.equals(remoteMd5sum)) {
                logger.info("Jar {} already exists [{}] and md5sum are equals", jarFilename,
                        remoteJarPath.toUri().toASCIIString());
            } else {
                logger.info("Jar {} already exists [{}] and md5sum are different!", jarFilename,
                        remoteJarPath.toUri().toASCIIString());
                copyJarToHDFS(fs, localJarPath, localMd5sum, remoteJarPath, remoteMd5Path);
            }

        }
        // In all cases we want to add the jar to the DistributedCache's classpath
        DistributedCache.addFileToClassPath(remoteJarPath, conf, fs);
    }
    // and we create the symlink (was necessary in earlier versions of Hadoop)
    DistributedCache.createSymlink(conf);
}
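
A possible way to call this helper from a job driver, sketched under the assumption that JobClasspathHelper has a no-argument constructor; MyMapper and MyReducer are placeholders for the job's own classes, and /user/hadoop/libs is an assumed staging directory on HDFS.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

public class PrepareClasspathSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path libDir = new Path("/user/hadoop/libs"); // assumed staging directory on HDFS
        JobClasspathHelper classpathHelper = new JobClasspathHelper();
        // Upload the jars containing the placeholder classes and add them to the job classpath.
        classpathHelper.prepareClasspath(conf, libDir, MyMapper.class, MyReducer.class);
    }
}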

From source file:com.cloudera.circus.test.TestXTest.java

License:Open Source License

@Test
@TestHadoop
public void testHadoopFileSystem() throws Exception {
    JobConf conf = getHadoopConf();
    FileSystem fs = FileSystem.get(conf);
    try {
        OutputStream os = fs.create(new Path(getHadoopTestDir(), "foo"));
        os.write(new byte[] { 1 });
        os.close();
        InputStream is = fs.open(new Path(getHadoopTestDir(), "foo"));
        Assert.assertEquals(is.read(), 1);
        Assert.assertEquals(is.read(), -1);
        is.close();
    } finally {
        fs.close();
    }
}