Example usage for the org.apache.hadoop.io.compress.DefaultCodec constructor DefaultCodec()

Introduction

On this page you can find example usages of the org.apache.hadoop.io.compress.DefaultCodec constructor DefaultCodec().

Prototype

DefaultCodec

Source Link

Usage

From source file:org.apache.jena.hadoop.rdf.io.input.compressed.rdfxml.DeflatedRdfXmlInputTest.java

License:Apache License

/**
 * Creates new tests.
 * <p>
 * Hands the superclass constructor the file extension {@code ".rdf.deflate"}
 * and a freshly constructed {@link DefaultCodec}.
 */
public DeflatedRdfXmlInputTest() {
    super(".rdf.deflate", new DefaultCodec());
}

From source file:org.apache.jena.hadoop.rdf.io.input.compressed.thrift.DeflatedThriftQuadInputTest.java

License:Apache License

/**
 * Creates new tests.
 * <p>
 * Hands the superclass constructor the file extension {@code ".trdf.deflate"}
 * and a freshly constructed {@link DefaultCodec}.
 */
public DeflatedThriftQuadInputTest() {
    super(".trdf.deflate", new DefaultCodec());
}

From source file:org.apache.jena.hadoop.rdf.io.input.compressed.thrift.DeflatedThriftTripleInputTest.java

License:Apache License

/**
 * Creates new tests.
 * <p>
 * Hands the superclass constructor the file extension {@code ".trdf.deflate"}
 * and a freshly constructed {@link DefaultCodec}.
 */
public DeflatedThriftTripleInputTest() {
    super(".trdf.deflate", new DefaultCodec());
}

From source file:org.apache.jena.hadoop.rdf.io.input.compressed.trig.DeflatedTriGInputTest.java

License:Apache License

/**
 * Creates new tests.
 * <p>
 * Hands the superclass constructor the file extension {@code ".trig.deflate"}
 * and a freshly constructed {@link DefaultCodec}.
 */
public DeflatedTriGInputTest() {
    super(".trig.deflate", new DefaultCodec());
}

From source file:org.apache.jena.hadoop.rdf.io.input.compressed.trix.DeflatedTriXInputTest.java

License:Apache License

/**
 * Creates new tests.
 * <p>
 * Hands the superclass constructor the file extension {@code ".trix.deflate"}
 * and a freshly constructed {@link DefaultCodec}.
 */
public DeflatedTriXInputTest() {
    super(".trix.deflate", new DefaultCodec());
}

From source file:org.apache.jena.hadoop.rdf.io.input.compressed.turtle.DeflatedTurtleInputTest.java

License:Apache License

/**
 * Creates new tests.
 * <p>
 * Hands the superclass constructor the file extension {@code ".nt.deflate"}
 * and a freshly constructed {@link DefaultCodec}.
 */
public DeflatedTurtleInputTest() {
    // NOTE(review): the extension is ".nt.deflate" although this is the Turtle
    // test; confirm whether ".ttl.deflate" was intended.
    super(".nt.deflate", new DefaultCodec());
}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDPCADenseTest.java

License:Apache License

/**
 * Runs the SSVD solver with a PCA mean over randomly generated sparse rows and
 * checks the result against a direct {@link SingularValueDecomposition} of the
 * same mean-subtracted matrix.
 * <p>
 * Steps: write {@code m} random rows to a block-compressed sequence file while
 * accumulating their column sums in {@code xi}; scale {@code xi} to the column
 * mean and save it; run {@link SSVDSolver} with that mean path; compare the
 * first {@code k + p} singular values within {@code s_epsilon}; finally check
 * the Q output of the Bt job for orthonormality.
 *
 * @param q value passed to {@code ssvd.setQ(q)}
 * @throws IOException if writing the input, running the solver or loading the
 *         results fails
 */
public void runSSVDSolver(int q) throws IOException {

    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    Random rnd = RandomUtils.getRandom();

    File tmpDir = getTestTempDir("svdtmp");
    conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

    Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

    int n = 100;
    int m = 2000;
    double percent = 5;

    // Accumulates the column sums of all generated rows; scaled to the mean below.
    Vector xi = new DenseVector(n);

    // create distributed row matrix-like struct
    SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath,
            IntWritable.class, VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
    boolean threw = true;
    try {
        VectorWritable vw = new VectorWritable();
        IntWritable roww = new IntWritable();

        double muAmplitude = 50.0;
        for (int i = 0; i < m; i++) {
            Vector dv = new SequentialAccessSparseVector(n);
            for (int j = 0; j < n * percent / 100; j++) {
                dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
            }
            roww.set(i);
            vw.set(dv);
            w.append(roww, vw);
            xi.assign(dv, Functions.PLUS);
        }
        threw = false;
    } finally {
        // Always release the writer. A close failure is only swallowed when the
        // loop above already failed, so it cannot mask the primary exception,
        // while a failed close after a successful write is still reported.
        Closeables.close(w, threw);
    }

    // Must be floating-point division: the int expression 1 / m evaluates to 0
    // for m > 1, which would zero out xi instead of turning sums into means.
    xi.assign(Functions.mult(1.0 / m));

    FileSystem fs = FileSystem.get(conf);

    Path tempDirPath = getTestTempDirPath("svd-proc");
    Path aPath = new Path(tempDirPath, "A/A.seq");
    fs.copyFromLocalFile(aLocPath, aPath);
    Path xiPath = new Path(tempDirPath, "xi/xi.seq");
    SSVDHelper.saveVector(xi, xiPath, conf);

    Path svdOutPath = new Path(tempDirPath, "SSVD-out");

    // make sure we wipe out previous test results, just a convenience
    fs.delete(svdOutPath, true);

    // Solver starts here:
    System.out.println("Input prepared, starting solver...");

    int ablockRows = 867;
    int p = 60;
    int k = 40;
    SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
    ssvd.setOuterBlockHeight(500);
    ssvd.setAbtBlockHeight(251);
    ssvd.setPcaMeanPath(xiPath);

    /*
     * The U and V jobs are switched off in this test to reduce running time.
     */
    ssvd.setComputeU(false);
    ssvd.setComputeV(false);

    ssvd.setOverwrite(true);
    ssvd.setQ(q);
    ssvd.setBroadcast(true);
    ssvd.run();

    Vector stochasticSValues = ssvd.getSingularValues();
    System.out.println("--SSVD solver singular values:");
    LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);
    System.out.println("--Colt SVD solver singular values:");

    // try to run the same thing without stochastic algo
    double[][] a = SSVDHelper.loadDistributedRowMatrix(fs, aPath, conf);

    // subtract pseudo pca mean
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            a[i][j] -= xi.getQuick(j);
        }
    }

    SingularValueDecomposition svd2 = new SingularValueDecomposition(new DenseMatrix(a));

    Vector svalues2 = new DenseVector(svd2.getSingularValues());
    LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

    for (int i = 0; i < k + p; i++) {
        assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
    }

    double[][] mQ = SSVDHelper.loadDistributedRowMatrix(fs,
            new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"), conf);

    SSVDCommonTest.assertOrthonormality(new DenseMatrix(mQ), false, s_epsilon);

}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDPCASparseTest.java

License:Apache License

/**
 * Runs the SSVD solver with a PCA mean over randomly generated, named sparse
 * rows and checks the result against a direct
 * {@link SingularValueDecomposition} of the same mean-subtracted matrix.
 * <p>
 * Besides comparing the first {@code k + p} singular values within
 * {@code s_epsilon} and checking the Q output of the Bt job for
 * orthonormality, this test requests the U*Sigma output and asserts that every
 * row read back from it is a {@link NamedVector} keyed by a {@link Text}.
 *
 * @param q value passed to {@code ssvd.setQ(q)} and to {@code verifyInternals}
 * @throws IOException if writing the input, running the solver or loading the
 *         results fails
 */
public void runSSVDSolver(int q) throws IOException {

    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    // Everything registered here is closed in the finally block below.
    Deque<Closeable> closeables = Lists.newLinkedList();
    try {
        Random rnd = RandomUtils.getRandom();

        File tmpDir = getTestTempDir("svdtmp");
        conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

        Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

        // create distributed row matrix-like struct
        SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath, Text.class,
                VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
        closeables.addFirst(w);

        int n = 100;
        int m = 2000;
        double percent = 5;

        VectorWritable vw = new VectorWritable();
        Text rkey = new Text();

        // Accumulates the column sums of all generated rows; scaled to the mean below.
        Vector xi = new DenseVector(n);

        double muAmplitude = 50.0;
        for (int i = 0; i < m; i++) {
            Vector dv = new SequentialAccessSparseVector(n);
            String rowname = "row-" + i;
            // namedRow is created before dv is filled; presumably NamedVector
            // delegates to dv so the values set below are visible through it
            // (TODO confirm).
            NamedVector namedRow = new NamedVector(dv, rowname);
            for (int j = 0; j < n * percent / 100; j++) {
                dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
            }
            // NOTE(review): the key text is "row-i<i>" while the vector name is
            // "row-<i>"; confirm whether the difference is intentional.
            rkey.set("row-i" + i);
            vw.set(namedRow);
            w.append(rkey, vw);
            xi.assign(dv, Functions.PLUS);
        }
        // The writer is closed explicitly here, so take it off the cleanup list first.
        closeables.remove(w);
        Closeables.close(w, false);

        // Turn the accumulated column sums into column means.
        xi.assign(Functions.mult(1.0 / m));

        FileSystem fs = FileSystem.get(conf);

        Path tempDirPath = getTestTempDirPath("svd-proc");
        Path aPath = new Path(tempDirPath, "A/A.seq");
        fs.copyFromLocalFile(aLocPath, aPath);
        Path xiPath = new Path(tempDirPath, "xi/xi.seq");
        SSVDHelper.saveVector(xi, xiPath, conf);

        Path svdOutPath = new Path(tempDirPath, "SSVD-out");

        // make sure we wipe out previous test results, just a convenience
        fs.delete(svdOutPath, true);

        // Solver starts here:
        System.out.println("Input prepared, starting solver...");

        int ablockRows = 867;
        int p = 60;
        int k = 40;
        SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
        ssvd.setOuterBlockHeight(500);
        ssvd.setAbtBlockHeight(251);
        ssvd.setPcaMeanPath(xiPath);

        /*
         * Removing V,U jobs from this test to reduce running time. i will keep them
         * put in the dense test though.
         *
         * For PCA test, we also want to request U*Sigma output and check it for named
         * vector propagation.
         */
        ssvd.setComputeU(false);
        ssvd.setComputeV(false);
        ssvd.setcUSigma(true);

        ssvd.setOverwrite(true);
        ssvd.setQ(q);
        ssvd.setBroadcast(true);
        ssvd.run();

        Vector stochasticSValues = ssvd.getSingularValues();

        // try to run the same thing without stochastic algo
        Matrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

        // Called on the loaded input before the mean is subtracted from it below.
        verifyInternals(svdOutPath, a, new Omega(ssvd.getOmegaSeed(), k + p), k + p, q);

        // subtract pseudo pca mean
        for (int i = 0; i < m; i++) {
            a.viewRow(i).assign(xi, Functions.MINUS);
        }

        SingularValueDecomposition svd2 = new SingularValueDecomposition(a);

        Vector svalues2 = new DenseVector(svd2.getSingularValues());

        System.out.println("--SSVD solver singular values:");
        LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);
        System.out.println("--SVD solver singular values:");
        LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

        for (int i = 0; i < k + p; i++) {
            assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
        }

        DenseMatrix mQ = SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"),
                conf);

        SSVDCommonTest.assertOrthonormality(mQ, false, s_epsilon);

        // assert name propagation
        // The iterator is handed the closeables deque, so whatever it registers
        // there is released by the finally block.
        for (Iterator<Pair<Writable, Vector>> iter = SSVDHelper.drmIterator(fs,
                new Path(ssvd.getuSigmaPath() + "/*"), conf, closeables); iter.hasNext();) {
            Pair<Writable, Vector> pair = iter.next();
            Writable key = pair.getFirst();
            Vector v = pair.getSecond();

            assertTrue(v instanceof NamedVector);
            assertTrue(key instanceof Text);
        }

    } finally {
        IOUtils.close(closeables);
    }
}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDSolverSparseSequentialTest.java

License:Apache License

/**
 * Runs the SSVD solver over randomly generated sparse rows and checks the
 * result against a direct {@link SingularValueDecomposition} of the same
 * matrix.
 * <p>
 * Steps: write {@code m} random rows to a block-compressed sequence file; run
 * {@link SSVDSolver} on it; compare the first {@code k + p} singular values
 * within {@code s_epsilon}; finally check the Q output of the Bt job for
 * orthonormality.
 *
 * @param q value passed to {@code ssvd.setQ(q)}
 * @throws IOException if writing the input, running the solver or loading the
 *         results fails
 */
public void runSSVDSolver(int q) throws IOException {

    Configuration conf = getConfiguration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    // Everything still registered here is closed in the finally block, so the
    // writer is released even when the test fails part-way through.
    Deque<Closeable> closeables = Lists.newLinkedList();
    try {
        Random rnd = RandomUtils.getRandom();

        File tmpDir = getTestTempDir("svdtmp");
        conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

        Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

        // create distributed row matrix-like struct
        SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath,
                IntWritable.class, VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
        closeables.addFirst(w);

        int n = 100;
        int m = 2000;
        double percent = 5;

        VectorWritable vw = new VectorWritable();
        IntWritable roww = new IntWritable();

        double muAmplitude = 50.0;
        for (int i = 0; i < m; i++) {
            Vector dv = new SequentialAccessSparseVector(n);
            for (int j = 0; j < n * percent / 100; j++) {
                dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.5));
            }
            roww.set(i);
            vw.set(dv);
            w.append(roww, vw);
        }
        // The writer is closed explicitly here, so take it off the cleanup list first.
        closeables.remove(w);
        Closeables.close(w, false);

        FileSystem fs = FileSystem.get(aLocPath.toUri(), conf);

        Path tempDirPath = getTestTempDirPath("svd-proc");
        Path aPath = new Path(tempDirPath, "A/A.seq");
        fs.copyFromLocalFile(aLocPath, aPath);

        Path svdOutPath = new Path(tempDirPath, "SSVD-out");

        // make sure we wipe out previous test results, just a convenience
        fs.delete(svdOutPath, true);

        // Solver starts here:
        System.out.println("Input prepared, starting solver...");

        int ablockRows = 867;
        int p = 60;
        int k = 40;
        SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
        ssvd.setOuterBlockHeight(500);
        ssvd.setAbtBlockHeight(251);

        /*
         * removing V,U jobs from this test to reduce running time. i will keep them
         * put in the dense test though.
         */
        ssvd.setComputeU(false);
        ssvd.setComputeV(false);

        ssvd.setOverwrite(true);
        ssvd.setQ(q);
        ssvd.setBroadcast(true);
        ssvd.run();

        Vector stochasticSValues = ssvd.getSingularValues();
        System.out.println("--SSVD solver singular values:");
        dumpSv(stochasticSValues);
        System.out.println("--Colt SVD solver singular values:");

        // try to run the same thing without stochastic algo
        DenseMatrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

        SingularValueDecomposition svd2 = new SingularValueDecomposition(a);

        Vector svalues2 = new DenseVector(svd2.getSingularValues());
        dumpSv(svalues2);

        for (int i = 0; i < k + p; i++) {
            assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
        }

        DenseMatrix mQ = SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"),
                conf);

        SSVDCommonTest.assertOrthonormality(mQ, false, s_epsilon);
    } finally {
        IOUtils.close(closeables);
    }
}

From source file:org.apache.pig.piggybank.test.storage.TestHiveColumnarLoader.java

License:Apache License

/**
 * Produces the date-partitioned RCFile fixture.
 * <p>
 * Creates a root directory named with the current time in milliseconds, and
 * beneath it one {@code daydate=<date>} directory for each of 4 consecutive
 * days starting on January 2nd of the current year. Each day directory receives
 * 5 files written through {@code writeRCFileTest} with a {@link DefaultCodec}.
 * <p>
 * Updates the static fixture state: {@code datePartitionedRowCount},
 * {@code datePartitionedDir}, {@code calendar}, {@code startingDate},
 * {@code endingDate}, {@code datePartitionedRCFiles} and
 * {@code datePartitionedDirs}. All created files and directories are registered
 * with {@link File#deleteOnExit()}.
 *
 * @throws IOException if a directory cannot be created or writing a file fails
 */
private static void produceDatePartitionedData() throws IOException {
    datePartitionedRowCount = 0;
    datePartitionedDir = new File("testhiveColumnarLoader-dateDir-" + System.currentTimeMillis());
    // Fail here with a clear message rather than later with an obscure write error.
    if (!datePartitionedDir.mkdir() && !datePartitionedDir.isDirectory()) {
        throw new IOException("Could not create directory " + datePartitionedDir.getAbsolutePath());
    }
    datePartitionedDir.deleteOnExit();

    int dates = 4;
    calendar = Calendar.getInstance();

    // DAY_OF_MONTH takes a numeric day, not a day-of-week constant. The literal
    // 2 keeps the start date this fixture has always used (Calendar.MONDAY == 2).
    calendar.set(Calendar.DAY_OF_MONTH, 2);
    calendar.set(Calendar.MONTH, Calendar.JANUARY);

    startingDate = dateFormat.format(calendar.getTime());

    datePartitionedRCFiles = new ArrayList<String>();
    datePartitionedDirs = new ArrayList<String>();

    for (int i = 0; i < dates; i++) {

        File file = new File(datePartitionedDir, "daydate=" + dateFormat.format(calendar.getTime()));
        calendar.add(Calendar.DAY_OF_MONTH, 1);

        if (!file.mkdir() && !file.isDirectory()) {
            throw new IOException("Could not create directory " + file.getAbsolutePath());
        }
        file.deleteOnExit();

        // for each daydate write 5 partitions
        for (int pi = 0; pi < 5; pi++) {
            // The "parition" spelling is part of the on-disk file name and is
            // kept as-is; other code may rely on it.
            Path path = new Path(new Path(file.getAbsolutePath()), "parition" + pi);

            datePartitionedRowCount += writeRCFileTest(fs, simpleRowCount, path, columnCount,
                    new DefaultCodec(), columnCount);

            new File(path.toString()).deleteOnExit();
            datePartitionedRCFiles.add(path.toString());
            // The day directory is recorded once per file, so this list has one
            // entry for each entry in datePartitionedRCFiles.
            datePartitionedDirs.add(file.toString());

        }

    }

    // The calendar has been advanced past the last written day at this point.
    endingDate = dateFormat.format(calendar.getTime());
}