Example usage for org.apache.hadoop.fs FileSystem open

List of usage examples for org.apache.hadoop.fs FileSystem open

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem#open, collected from open-source projects.

Prototype

public FSDataInputStream open(Path f) throws IOException

Document

Opens an FSDataInputStream at the indicated Path. (FileSystem also provides an open(PathHandle fd) overload, which opens a stream matching a PathHandle instance; all of the examples below use the Path overload.)
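
For orientation, here is a minimal, self-contained sketch of the typical pattern before the project examples. The path /tmp/example.txt and the FsOpenExample class are hypothetical, not taken from any of the projects below.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FsOpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical input path; replace with a real file on your cluster.
        Path path = new Path("/tmp/example.txt");

        // open() returns an FSDataInputStream; try-with-resources ensures it is closed.
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}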

Usage

From source file:com.ML_Hadoop.MultipleLinearRegression.MultipleLinearRegressionMapReduce.java

public static void main(String[] args) throws Exception {
    String[] theta;
    int iteration = 0, num_of_iteration = 1;
    int feature_size = 0, input_data_size = 0;
    FileSystem fs;
    Float alpha = 0.1f;

    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);

        Job job = new Job(conf, "LinearRegressionMapReduce");
        job.setJarByClass(MultipleLinearRegressionMapReduce.class);

        // re-fetch the job's Configuration so that "theta" (set below) is propagated to the tasks
        conf = job.getConfiguration();

        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatWritable.class);

        job.setMapperClass(MultipleLinearRegressionMap.class);
        job.setReducerClass(MultipleLinearRegressionReduce.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        job.setNumReduceTasks(1); // set mapred.reduce.tasks = 1 (only one reducer)

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out))
            fs.delete(out, true);

        FileOutputFormat.setOutputPath(job, out);
        alpha = Float.parseFloat(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);
        input_data_size = Integer.parseInt(args[5]);
        conf.setFloat("alpha", alpha);
        conf.setInt("feature_size", feature_size);
        conf.setInt("input_data_size", input_data_size);
        conf.setInt("iteration", iteration);

        theta = new String[feature_size];

        if (iteration == 0) { // first iteration
            for (int i = 0; i < theta.length; i++)
                theta[i] = "0.0";
            conf.setStrings("theta", theta);
        } else {
            try {
                String uri = "/user/hduser/theta.txt";
                fs = FileSystem.get(conf);
                // Read the comma-separated theta values produced by the previous iteration.
                BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(new Path(uri))));
                theta = br.readLine().split(",");
                br.close();
            } catch (Exception e) {
                // If theta.txt cannot be read, theta keeps its initial (null) entries.
                e.printStackTrace();
            }
            conf.setStrings("theta", theta);
        }

        for (int i = 0; i < theta.length; i++)
            System.out.println("In MapRedce main function: theta[ " + i + " ]" + theta[i]);

        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);

}
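
A more defensive variant of the theta-reading step above uses try-with-resources, so the stream returned by fs.open() is closed even if reading fails. This is only a sketch; the ThetaReader class and readTheta method are hypothetical helpers, not part of the original project.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class ThetaReader {
    private ThetaReader() {
    }

    /** Reads one comma-separated line of theta values, or returns the defaults on failure. */
    public static String[] readTheta(FileSystem fs, String uri, String[] defaults) {
        try (FSDataInputStream in = fs.open(new Path(uri));
                BufferedReader br = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line = br.readLine();
            return (line == null) ? defaults : line.split(",");
        } catch (IOException e) {
            e.printStackTrace();
            return defaults;
        }
    }
}

In the example above this could replace the body of the else branch with a single call: theta = ThetaReader.readTheta(fs, "/user/hduser/theta.txt", theta);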

From source file:com.mongodb.hadoop.splitter.BSONSplitter.java

License:Apache License

public void loadSplitsFromSplitFile(final FileStatus inputFile, final Path splitFile)
        throws NoSplitFileException, IOException {
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    FileSystem fs = splitFile.getFileSystem(getConf()); // throws IOException
    FileStatus splitFileStatus;
    try {
        splitFileStatus = fs.getFileStatus(splitFile);
        LOG.info("Found split file at : " + splitFileStatus);
    } catch (Exception e) {
        throw new NoSplitFileException();
    }
    FSDataInputStream fsDataStream = fs.open(splitFile); // throws IOException
    while (fsDataStream.getPos() < splitFileStatus.getLen()) {
        callback.reset();
        bsonDec.decode(fsDataStream, callback);
        BSONObject splitInfo = (BSONObject) callback.get();
        splits.add(createFileSplitFromBSON(splitInfo, fs, inputFile));
    }
    splitsList = splits;
}
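
Note that loadSplitsFromSplitFile above does not close fsDataStream after reading the splits; the readSplitsForFile method in the next example closes its stream in a finally block, which is the safer pattern for streams returned by fs.open().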

From source file:com.mongodb.hadoop.splitter.BSONSplitter.java

License:Apache License

public void readSplitsForFile(final FileStatus file) throws IOException {
    Path path = file.getPath();
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    FileSystem fs = path.getFileSystem(getConf());
    long length = file.getLen();
    if (!getConf().getBoolean("bson.split.read_splits", true)) {
        LOG.info("Reading splits is disabled - constructing single split for " + file);
        FileSplit onesplit = createFileSplit(file, fs, 0, length);
        splits.add(onesplit);
        splitsList = splits;
        return;
    }
    if (length != 0) {
        int numDocsRead = 0;
        long splitSize = getSplitSize(getConf(), file);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
        }
        FSDataInputStream fsDataStream = fs.open(path);
        long curSplitLen = 0;
        long curSplitStart = 0;
        try {
            while (fsDataStream.getPos() + 1 < length) {
                lazyCallback.reset();
                lazyDec.decode(fsDataStream, lazyCallback);
                LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
                int bsonDocSize = bo.getBSONSize();
                if (curSplitLen + bsonDocSize >= splitSize) {
                    FileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                    splits.add(split);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
                    }
                    curSplitStart = fsDataStream.getPos() - bsonDocSize;
                    curSplitLen = 0;
                }
                curSplitLen += bsonDocSize;
                numDocsRead++;
                if (numDocsRead % 1000 == 0) {
                    float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.",
                                numDocsRead, file.getPath(), splitProgress));
                    }
                }
            }
            if (curSplitLen > 0) {
                FileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
                }
            }
            splitsList = splits;
            if (LOG.isDebugEnabled()) {
                LOG.debug("Completed splits calculation for " + file.getPath());
            }
            writeSplits();
        } catch (IOException e) {
            LOG.warn("IOException: " + e);
        } finally {
            fsDataStream.close();
        }
    } else {
        LOG.warn("Zero-length file, skipping split calculation.");
    }
}

From source file:com.moz.fiji.mapreduce.input.impl.WholeFileRecordReader.java

License:Apache License

/**
 * <p>If the file has not already been read, this reads it into memory, so that a call
 * to getCurrentValue() will return the entire contents of this file as Text,
 * and getCurrentKey() will return the qualified path to this file as Text.  Then, returns
 * true.  If it has already been read, then returns false without updating any internal state.</p>
 *
 * @return Whether the file was read or not.
 * @throws IOException if there is an error reading the file.
 * @throws InterruptedException if there is an error.
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!mProcessed) {
        if (mFileLength > (long) Integer.MAX_VALUE) {
            throw new IOException("File is longer than Integer.MAX_VALUE.");
        }
        byte[] contents = new byte[(int) mFileLength];

        FileSystem fs = mFileToRead.getFileSystem(mConf);
        FSDataInputStream in = null;
        try {
            // Set the contents of this file.
            in = fs.open(mFileToRead);
            IOUtils.readFully(in, contents, 0, contents.length);
            mFileText.set(contents, 0, contents.length);

            // Set the name of this file.
            String fileName = mFileToRead.makeQualified(fs).toString();
            mFileName.set(fileName);
        } finally {
            IOUtils.closeStream(in);
        }
        mProcessed = true;
        return true;
    }
    return false;
}

From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java

License:Apache License

/** Test FijiTableInputFormat in a map-only job. */
@Test
public void testMapJob() throws Exception {
    final Path outputFile = createOutputFile();
    // Create a test job.
    final Job job = setupJob("testMapJob", outputFile, TestMapper.class, null, // reducer class
            null, // start key
            null, // limit key
            null); // filter

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet("usermail.example.com\tAaron Kimball", "gmail.com\tJohn Doe",
            "usermail.example.com\tChristophe Bisciglia", "usermail.example.com\tKiyan Ahmadizadeh",
            "gmail.com\tJane Doe", "usermail.example.com\tGarrett Wu");
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);

    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other thread's filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}

From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java

License:Apache License

/** Test FijiTableInputFormat in a map-only job with start and limit keys. */
@Test
public void testMapJobWithStartAndLimitKeys() throws Exception {
    final Path outputFile = createOutputFile();
    // Set the same entity IDs for start and limit, and we should get just the start row
    final EntityId startEntityId = getFooTable().getEntityId("jane.doe@gmail.com");
    final byte[] endRowKey = startEntityId.getHBaseRowKey();
    final EntityId rawLimitEntityId = HBaseEntityId
            .fromHBaseRowKey(Arrays.copyOf(endRowKey, endRowKey.length + 1));

    // Create a test job.
    final Job job = setupJob("testMapJobWithStartAndLimitKeys", outputFile, TestMapper.class, null, // reducer class
            startEntityId, rawLimitEntityId, null); // filter

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet("gmail.com\tJane Doe");
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);

    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other thread's filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}

From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java

License:Apache License

/** Test FijiTableInputFormat in a map-only job with a row filter. */
@Test
public void testMapJobWithFilter() throws Exception {
    final FijiRowFilter filter = new ColumnValueEqualsRowFilter("info", "email",
            new DecodedCell<String>(Schema.create(Schema.Type.STRING), "aaron@usermail.example.com"));
    final Path outputFile = createOutputFile();
    // Create a test job.
    final Job job = setupJob("testMapJobWithFilter", outputFile, TestMapper.class, null, // reducer class
            null, // start key
            null, // limit key
            filter);

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet("usermail.example.com\tAaron Kimball");
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);

    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other thread's filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}

From source file:com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java

License:Apache License

/** Test FijiTableInputFormat in a MapReduce job. */
@Test
public void testMapReduceJob() throws Exception {
    final Path outputFile = createOutputFile();
    // Create a test job.
    final Job job = setupJob("testMapReduceJob", outputFile, TestMapper.class, TestReducer.class, null, // start key
            null, // limit key
            null); // filter

    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));

    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));

    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> output = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final ImmutableMap.Builder<String, Set<String>> builder = ImmutableMap.builder();
    for (String line : output) {
        final String[] keyValue = line.split("\t");
        final String emailDomain = keyValue[0];
        final Set<String> names = Sets.newHashSet(keyValue[1].split(","));

        builder.put(emailDomain, names);
    }
    final Map<String, Set<String>> actual = builder.build();
    final Map<String, Set<String>> expected = ImmutableMap.<String, Set<String>>builder()
            .put("usermail.example.com",
                    Sets.newHashSet("Aaron Kimball", "Christophe Bisciglia", "Kiyan Ahmadizadeh", "Garrett Wu"))
            .put("gmail.com", Sets.newHashSet("John Doe", "Jane Doe")).build();
    assertEquals("Result of job wasn't what was expected", expected, actual);

    // Clean up.
    fs.delete(outputFile.getParent(), true);

    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other thread's filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}

From source file:com.moz.fiji.mapreduce.lib.bulkimport.DescribedInputTextBulkImporter.java

License:Apache License

/**
 * Sets the path to the text input descriptor file and parses it.
 *
 * @param inputDescriptorFile The input descriptor path.
 * @throws RuntimeException if there's an error reading or parsing the input descriptor.
 */
@HadoopConf(key = CONF_FILE, usage = "The input descriptor file.")
protected final void setInputDescriptorPath(String inputDescriptorFile) {

    if (null == inputDescriptorFile || inputDescriptorFile.isEmpty()) {
        // Remind the user to specify this path.
        LOG.error("No input-descriptor path specified.");
        throw new RuntimeException("No input descriptor file specified on the Configuration."
                + "  Did you specify the " + CONF_FILE + " variable?");
    }

    Path descriptorPath = new Path(inputDescriptorFile);
    try {
        LOG.info("Parsing input-descriptor file: " + descriptorPath.toString());
        FileSystem fs = descriptorPath.getFileSystem(getConf());
        FSDataInputStream inputStream = fs.open(descriptorPath);
        mTableImportDescriptor = FijiTableImportDescriptor.createFromEffectiveJson(inputStream);

    } catch (IOException ioe) {
        LOG.error("Could not read input-descriptor file: " + descriptorPath.toString());
        throw new RuntimeException("Could not read file: " + descriptorPath.toString(), ioe);
    }
}

From source file:com.moz.fiji.schema.tools.CreateTableTool.java

License:Apache License

/** {@inheritDoc} */
@Override
protected int run(List<String> nonFlagArgs) throws Exception {
    getPrintStream().println("Parsing table layout: " + mLayout);
    final Path path = new Path(mLayout);
    final FileSystem fs = fileSystemSpecified(path) ? path.getFileSystem(getConf())
            : FileSystem.getLocal(getConf());
    final FSDataInputStream inputStream = fs.open(path);
    final TableLayoutDesc tableLayout = FijiTableLayout.readTableLayoutDescFromJSON(inputStream);
    final String tableName = tableLayout.getName();
    Preconditions.checkArgument((mTableURI.getTable() == null) || tableName.equals(mTableURI.getTable()),
            "Table name '%s' does not match URI %s", tableName, mTableURI);

    // For large numbers of initial regions, table creation may take a long time as we wait for
    // the new regions to come online. Increase the hbase RPC timeout to compensate.
    int hbaseTimeout = getConf().getInt("hbase.rpc.timeout", 60000);
    hbaseTimeout = hbaseTimeout * 10;
    getConf().setInt("hbase.rpc.timeout", hbaseTimeout);

    getPrintStream().println("Creating Fiji table " + mTableURI);
    if (mNumRegions >= 1) {
        // Create a table with an initial number of evenly split regions.
        mFiji.createTable(tableLayout, mNumRegions);

    } else if (!mSplitKeyFilePath.isEmpty()) {
        switch (FijiTableLayout.getEncoding(tableLayout.getKeysFormat())) {
        case HASH:
        case HASH_PREFIX:
            throw new IllegalArgumentException(
                    "Row key hashing is enabled for the table. Use --num-regions=N instead.");
        case RAW:
            break;
        case FORMATTED:
            // TODO Support pre-splitting tables for FORMATTED RKF
            // (https://jira.fiji.org/browse/SCHEMA-172)
            throw new RuntimeException("CLI support for FORMATTED row keys is not yet available");
        default:
            throw new RuntimeException(
                    "Unexpected row key encoding: " + FijiTableLayout.getEncoding(tableLayout.getKeysFormat()));
        }
        // Open the split key file.
        final Path splitKeyFilePath = new Path(mSplitKeyFilePath);
        final FileSystem splitKeyPathFs = fileSystemSpecified(splitKeyFilePath)
                ? splitKeyFilePath.getFileSystem(getConf())
                : FileSystem.getLocal(getConf());
        final FSDataInputStream splitKeyFileInputStream = splitKeyPathFs.open(splitKeyFilePath);

        // Read the split keys.
        final List<byte[]> splitKeys = SplitKeyFile.decodeRegionSplitList(splitKeyFileInputStream);
        LOG.debug("Read {} keys from split-key-file '{}':", splitKeys.size(), splitKeyFilePath);
        for (int i = 0; i < splitKeys.size(); ++i) {
            LOG.debug("Split key #{}: {}", i, Bytes.toStringBinary(splitKeys.get(i)));
        }

        // Create the table with the given split keys.
        mFiji.createTable(tableLayout, splitKeys.toArray(new byte[splitKeys.size()][]));

    } else {
        // Create a table with a single initial region:
        mFiji.createTable(tableLayout);
    }

    return SUCCESS;
}