Example usage for org.apache.hadoop.io SequenceFile createWriter

List of usage examples for org.apache.hadoop.io SequenceFile createWriter

Introduction

On this page you can find example usages of org.apache.hadoop.io SequenceFile createWriter.

Prototype

@Deprecated
public static Writer createWriter(Configuration conf, FSDataOutputStream out, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec, Metadata metadata) throws IOException 

Document

Construct the preferred type of 'raw' SequenceFile Writer.
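
Before the collected examples, here is a minimal, self-contained sketch of calling the overload shown in the prototype above. It is not taken from any of the source files below; the local path, key/value classes, codec, and class name are illustrative assumptions only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.DefaultCodec;

public class SequenceFileWriterSketch {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        Path path = new Path("/tmp/example.seq"); // illustrative local path

        // This deprecated overload writes to a caller-supplied stream, so the
        // caller is expected to close that stream itself after closing the writer.
        FSDataOutputStream out = fs.create(path);
        @SuppressWarnings("deprecation")
        SequenceFile.Writer writer = SequenceFile.createWriter(conf, out, IntWritable.class, Text.class,
                SequenceFile.CompressionType.BLOCK, new DefaultCodec(), new SequenceFile.Metadata());
        try {
            writer.append(new IntWritable(1), new Text("hello"));
        } finally {
            writer.close();
            out.close();
        }
    }
}

Note that most of the examples below use the older FileSystem/Path-based overloads of createWriter rather than the stream-based overload in the prototype. In current Hadoop releases, the non-deprecated replacement is the Writer.Option-based form, e.g. SequenceFile.createWriter(conf, Writer.file(path), Writer.keyClass(IntWritable.class), Writer.valueClass(Text.class), Writer.compression(CompressionType.BLOCK, new DefaultCodec())).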

Usage

From source file:gobblin.metastore.FsStateStore.java

License:Apache License

/**
 * See {@link StateStore#putAll(String, String, Collection)}.
 *
 * <p>
 *   This implementation does not support putting the state objects into an existing store,
 *   since append is not yet supported by the Hadoop SequenceFile (HADOOP-7139).
 * </p>
 */
@Override
public void putAll(String storeName, String tableName, Collection<T> states) throws IOException {
    String tmpTableName = this.useTmpFileForPut ? TMP_FILE_PREFIX + tableName : tableName;
    Path tmpTablePath = new Path(new Path(this.storeRootDir, storeName), tmpTableName);

    if (!this.fs.exists(tmpTablePath) && !create(storeName, tmpTableName)) {
        throw new IOException("Failed to create a state file for table " + tmpTableName);
    }

    Closer closer = Closer.create();
    try {
        @SuppressWarnings("deprecation")
        SequenceFile.Writer writer = closer.register(SequenceFile.createWriter(this.fs, this.conf, tmpTablePath,
                Text.class, this.stateClass, SequenceFile.CompressionType.BLOCK, new DefaultCodec()));
        for (T state : states) {
            writer.append(new Text(Strings.nullToEmpty(state.getId())), state);
        }
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }

    if (this.useTmpFileForPut) {
        Path tablePath = new Path(new Path(this.storeRootDir, storeName), tableName);
        HadoopUtils.renamePath(this.fs, tmpTablePath, tablePath);
    }
}

From source file:io.druid.indexer.IndexGeneratorJobTest.java

License:Apache License

private void writeDataToLocalSequenceFile(File outputFile, List<String> data) throws IOException {
    Configuration conf = new Configuration();
    LocalFileSystem fs = FileSystem.getLocal(conf);
    Writer fileWriter = SequenceFile.createWriter(fs, conf, new Path(outputFile.getAbsolutePath()),
            BytesWritable.class, BytesWritable.class, SequenceFile.CompressionType.NONE,
            (CompressionCodec) null);

    int keyCount = 10;
    for (String line : data) {
        ByteBuffer buf = ByteBuffer.allocate(4);
        buf.putInt(keyCount);
        BytesWritable key = new BytesWritable(buf.array());
        BytesWritable value = new BytesWritable(line.getBytes(Charsets.UTF_8));
        fileWriter.append(key, value);
        keyCount += 1;
    }

    fileWriter.close();
}

From source file:org.apache.druid.indexer.IndexGeneratorJobTest.java

License:Apache License

private void writeDataToLocalSequenceFile(File outputFile, List<String> data) throws IOException {
    Configuration conf = new Configuration();
    LocalFileSystem fs = FileSystem.getLocal(conf);
    Writer fileWriter = SequenceFile.createWriter(fs, conf, new Path(outputFile.getAbsolutePath()),
            BytesWritable.class, BytesWritable.class, SequenceFile.CompressionType.NONE,
            (CompressionCodec) null);

    int keyCount = 10;
    for (String line : data) {
        ByteBuffer buf = ByteBuffer.allocate(4);
        buf.putInt(keyCount);
        BytesWritable key = new BytesWritable(buf.array());
        BytesWritable value = new BytesWritable(StringUtils.toUtf8(line));
        fileWriter.append(key, value);
        keyCount += 1;
    }

    fileWriter.close();
}

From source file:org.apache.gobblin.metastore.FsStateStore.java

License:Apache License

/**
 * See {@link StateStore#put(String, String, T)}.
 *
 * <p>
 *   This implementation does not support putting the state object into an existing store,
 *   since append is not yet supported by the Hadoop SequenceFile (HADOOP-7139).
 * </p>
 */
@Override
public void put(String storeName, String tableName, T state) throws IOException {
    String tmpTableName = this.useTmpFileForPut ? TMP_FILE_PREFIX + tableName : tableName;
    Path tmpTablePath = new Path(new Path(this.storeRootDir, storeName), tmpTableName);

    if (!this.fs.exists(tmpTablePath) && !create(storeName, tmpTableName)) {
        throw new IOException("Failed to create a state file for table " + tmpTableName);
    }

    Closer closer = Closer.create();
    try {
        @SuppressWarnings("deprecation")
        SequenceFile.Writer writer = closer.register(SequenceFile.createWriter(this.fs, this.conf, tmpTablePath,
                Text.class, this.stateClass, SequenceFile.CompressionType.BLOCK, new DefaultCodec()));
        writer.append(new Text(Strings.nullToEmpty(state.getId())), state);
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }

    if (this.useTmpFileForPut) {
        Path tablePath = new Path(new Path(this.storeRootDir, storeName), tableName);
        renamePath(tmpTablePath, tablePath);
    }
}

From source file:org.apache.gobblin.metastore.FsStateStore.java

License:Apache License

/**
 * See {@link StateStore#putAll(String, String, Collection)}.
 *
 * <p>
 *   This implementation does not support putting the state objects into an existing store,
 *   since append is not yet supported by the Hadoop SequenceFile (HADOOP-7139).
 * </p>
 */
@Override
public void putAll(String storeName, String tableName, Collection<T> states) throws IOException {
    String tmpTableName = this.useTmpFileForPut ? TMP_FILE_PREFIX + tableName : tableName;
    Path tmpTablePath = new Path(new Path(this.storeRootDir, storeName), tmpTableName);

    if (!this.fs.exists(tmpTablePath) && !create(storeName, tmpTableName)) {
        throw new IOException("Failed to create a state file for table " + tmpTableName);
    }

    Closer closer = Closer.create();
    try {
        @SuppressWarnings("deprecation")
        SequenceFile.Writer writer = closer.register(SequenceFile.createWriter(this.fs, this.conf, tmpTablePath,
                Text.class, this.stateClass, SequenceFile.CompressionType.BLOCK, new DefaultCodec()));
        for (T state : states) {
            writer.append(new Text(Strings.nullToEmpty(state.getId())), state);
        }
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }

    if (this.useTmpFileForPut) {
        Path tablePath = new Path(new Path(this.storeRootDir, storeName), tableName);
        renamePath(tmpTablePath, tablePath);
    }
}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDPCADenseTest.java

License:Apache License

public void runSSVDSolver(int q) throws IOException {

    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    Deque<Closeable> closeables = new LinkedList<Closeable>();
    Random rnd = RandomUtils.getRandom();

    File tmpDir = getTestTempDir("svdtmp");
    conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

    Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

    // create distributed row matrix-like struct
    SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath,
            IntWritable.class, VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
    closeables.addFirst(w);

    int n = 100;
    int m = 2000;
    double percent = 5;

    VectorWritable vw = new VectorWritable();
    IntWritable roww = new IntWritable();

    Vector xi = new DenseVector(n);

    double muAmplitude = 50.0;
    for (int i = 0; i < m; i++) {
        Vector dv = new SequentialAccessSparseVector(n);
        for (int j = 0; j < n * percent / 100; j++) {
            dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
        }
        roww.set(i);
        vw.set(dv);
        w.append(roww, vw);
        xi.assign(dv, Functions.PLUS);
    }
    closeables.remove(w);
    Closeables.close(w, true);

    xi.assign(Functions.mult(1.0 / m));

    FileSystem fs = FileSystem.get(conf);

    Path tempDirPath = getTestTempDirPath("svd-proc");
    Path aPath = new Path(tempDirPath, "A/A.seq");
    fs.copyFromLocalFile(aLocPath, aPath);
    Path xiPath = new Path(tempDirPath, "xi/xi.seq");
    SSVDHelper.saveVector(xi, xiPath, conf);

    Path svdOutPath = new Path(tempDirPath, "SSVD-out");

    // make sure we wipe out previous test results, just a convenience
    fs.delete(svdOutPath, true);

    // Solver starts here:
    System.out.println("Input prepared, starting solver...");

    int ablockRows = 867;
    int p = 60;
    int k = 40;
    SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
    ssvd.setOuterBlockHeight(500);
    ssvd.setAbtBlockHeight(251);
    ssvd.setPcaMeanPath(xiPath);

    /*
     * removing V,U jobs from this test to reduce running time. i will keep them
     * put in the dense test though.
     */
    ssvd.setComputeU(false);
    ssvd.setComputeV(false);

    ssvd.setOverwrite(true);
    ssvd.setQ(q);
    ssvd.setBroadcast(true);
    ssvd.run();

    Vector stochasticSValues = ssvd.getSingularValues();
    System.out.println("--SSVD solver singular values:");
    LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);
    System.out.println("--Colt SVD solver singular values:");

    // try to run the same thing without stochastic algo
    double[][] a = SSVDHelper.loadDistributedRowMatrix(fs, aPath, conf);

    // subtract pseudo pca mean
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
            a[i][j] -= xi.getQuick(j);

    SingularValueDecomposition svd2 = new SingularValueDecomposition(new DenseMatrix(a));

    Vector svalues2 = new DenseVector(svd2.getSingularValues());
    LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

    for (int i = 0; i < k + p; i++) {
        assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
    }

    double[][] mQ = SSVDHelper.loadDistributedRowMatrix(fs,
            new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"), conf);

    SSVDCommonTest.assertOrthonormality(new DenseMatrix(mQ), false, s_epsilon);

}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDPCASparseTest.java

License:Apache License

public void runSSVDSolver(int q) throws IOException {

    Configuration conf = new Configuration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    Deque<Closeable> closeables = Lists.newLinkedList();
    try {
        Random rnd = RandomUtils.getRandom();

        File tmpDir = getTestTempDir("svdtmp");
        conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

        Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

        // create distributed row matrix-like struct
        SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath, Text.class,
                VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
        closeables.addFirst(w);

        int n = 100;
        int m = 2000;
        double percent = 5;

        VectorWritable vw = new VectorWritable();
        Text rkey = new Text();

        Vector xi = new DenseVector(n);

        double muAmplitude = 50.0;
        for (int i = 0; i < m; i++) {
            Vector dv = new SequentialAccessSparseVector(n);
            String rowname = "row-" + i;
            NamedVector namedRow = new NamedVector(dv, rowname);
            for (int j = 0; j < n * percent / 100; j++) {
                dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.25));
            }
            rkey.set("row-i" + i);
            vw.set(namedRow);
            w.append(rkey, vw);
            xi.assign(dv, Functions.PLUS);
        }
        closeables.remove(w);
        Closeables.close(w, false);

        xi.assign(Functions.mult(1.0 / m));

        FileSystem fs = FileSystem.get(conf);

        Path tempDirPath = getTestTempDirPath("svd-proc");
        Path aPath = new Path(tempDirPath, "A/A.seq");
        fs.copyFromLocalFile(aLocPath, aPath);
        Path xiPath = new Path(tempDirPath, "xi/xi.seq");
        SSVDHelper.saveVector(xi, xiPath, conf);

        Path svdOutPath = new Path(tempDirPath, "SSVD-out");

        // make sure we wipe out previous test results, just a convenience
        fs.delete(svdOutPath, true);

        // Solver starts here:
        System.out.println("Input prepared, starting solver...");

        int ablockRows = 867;
        int p = 60;
        int k = 40;
        SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
        ssvd.setOuterBlockHeight(500);
        ssvd.setAbtBlockHeight(251);
        ssvd.setPcaMeanPath(xiPath);

        /*
         * Removing V,U jobs from this test to reduce running time. i will keep them
         * put in the dense test though.
         *
         * For PCA test, we also want to request U*Sigma output and check it for named
         * vector propagation.
         */
        ssvd.setComputeU(false);
        ssvd.setComputeV(false);
        ssvd.setcUSigma(true);

        ssvd.setOverwrite(true);
        ssvd.setQ(q);
        ssvd.setBroadcast(true);
        ssvd.run();

        Vector stochasticSValues = ssvd.getSingularValues();

        // try to run the same thing without stochastic algo
        Matrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

        verifyInternals(svdOutPath, a, new Omega(ssvd.getOmegaSeed(), k + p), k + p, q);

        // subtract pseudo pca mean
        for (int i = 0; i < m; i++) {
            a.viewRow(i).assign(xi, Functions.MINUS);
        }

        SingularValueDecomposition svd2 = new SingularValueDecomposition(a);

        Vector svalues2 = new DenseVector(svd2.getSingularValues());

        System.out.println("--SSVD solver singular values:");
        LocalSSVDSolverSparseSequentialTest.dumpSv(stochasticSValues);
        System.out.println("--SVD solver singular values:");
        LocalSSVDSolverSparseSequentialTest.dumpSv(svalues2);

        for (int i = 0; i < k + p; i++) {
            assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
        }

        DenseMatrix mQ = SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"),
                conf);

        SSVDCommonTest.assertOrthonormality(mQ, false, s_epsilon);

        // assert name propagation
        for (Iterator<Pair<Writable, Vector>> iter = SSVDHelper.drmIterator(fs,
                new Path(ssvd.getuSigmaPath() + "/*"), conf, closeables); iter.hasNext();) {
            Pair<Writable, Vector> pair = iter.next();
            Writable key = pair.getFirst();
            Vector v = pair.getSecond();

            assertTrue(v instanceof NamedVector);
            assertTrue(key instanceof Text);
        }

    } finally {
        IOUtils.close(closeables);
    }
}

From source file:org.apache.mahout.math.hadoop.stochasticsvd.LocalSSVDSolverSparseSequentialTest.java

License:Apache License

public void runSSVDSolver(int q) throws IOException {

    Configuration conf = getConfiguration();
    conf.set("mapred.job.tracker", "local");
    conf.set("fs.default.name", "file:///");

    // conf.set("mapred.job.tracker","localhost:11011");
    // conf.set("fs.default.name","hdfs://localhost:11010/");

    Deque<Closeable> closeables = Lists.newLinkedList();
    Random rnd = RandomUtils.getRandom();

    File tmpDir = getTestTempDir("svdtmp");
    conf.set("hadoop.tmp.dir", tmpDir.getAbsolutePath());

    Path aLocPath = new Path(getTestTempDirPath("svdtmp/A"), "A.seq");

    // create distributed row matrix-like struct
    SequenceFile.Writer w = SequenceFile.createWriter(FileSystem.getLocal(conf), conf, aLocPath,
            IntWritable.class, VectorWritable.class, CompressionType.BLOCK, new DefaultCodec());
    closeables.addFirst(w);

    int n = 100;
    int m = 2000;
    double percent = 5;

    VectorWritable vw = new VectorWritable();
    IntWritable roww = new IntWritable();

    double muAmplitude = 50.0;
    for (int i = 0; i < m; i++) {
        Vector dv = new SequentialAccessSparseVector(n);
        for (int j = 0; j < n * percent / 100; j++) {
            dv.setQuick(rnd.nextInt(n), muAmplitude * (rnd.nextDouble() - 0.5));
        }
        roww.set(i);
        vw.set(dv);
        w.append(roww, vw);
    }
    closeables.remove(w);
    Closeables.close(w, false);

    FileSystem fs = FileSystem.get(aLocPath.toUri(), conf);

    Path tempDirPath = getTestTempDirPath("svd-proc");
    Path aPath = new Path(tempDirPath, "A/A.seq");
    fs.copyFromLocalFile(aLocPath, aPath);

    Path svdOutPath = new Path(tempDirPath, "SSVD-out");

    // make sure we wipe out previous test results, just a convenience
    fs.delete(svdOutPath, true);

    // Solver starts here:
    System.out.println("Input prepared, starting solver...");

    int ablockRows = 867;
    int p = 60;
    int k = 40;
    SSVDSolver ssvd = new SSVDSolver(conf, new Path[] { aPath }, svdOutPath, ablockRows, k, p, 3);
    ssvd.setOuterBlockHeight(500);
    ssvd.setAbtBlockHeight(251);

    /*
     * removing V,U jobs from this test to reduce running time. i will keep them
     * put in the dense test though.
     */
    ssvd.setComputeU(false);
    ssvd.setComputeV(false);

    ssvd.setOverwrite(true);
    ssvd.setQ(q);
    ssvd.setBroadcast(true);
    ssvd.run();

    Vector stochasticSValues = ssvd.getSingularValues();
    System.out.println("--SSVD solver singular values:");
    dumpSv(stochasticSValues);
    System.out.println("--Colt SVD solver singular values:");

    // try to run the same thing without stochastic algo
    DenseMatrix a = SSVDHelper.drmLoadAsDense(fs, aPath, conf);

    // SingularValueDecompositionImpl svd=new SingularValueDecompositionImpl(new
    // Array2DRowRealMatrix(a));
    SingularValueDecomposition svd2 = new SingularValueDecomposition(a);

    Vector svalues2 = new DenseVector(svd2.getSingularValues());
    dumpSv(svalues2);

    for (int i = 0; i < k + p; i++) {
        assertTrue(Math.abs(svalues2.getQuick(i) - stochasticSValues.getQuick(i)) <= s_epsilon);
    }

    DenseMatrix mQ = SSVDHelper.drmLoadAsDense(fs, new Path(svdOutPath, "Bt-job/" + BtJob.OUTPUT_Q + "-*"),
            conf);

    SSVDCommonTest.assertOrthonormality(mQ, false, s_epsilon);

    IOUtils.close(closeables);
}

From source file:org.apache.nutch.parse.ParseOutputFormat.java

License:Apache License

public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
        throws IOException {

    if (job.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(job);
    }

    if (job.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    }

    this.scfilters = new ScoringFilters(job);
    final int interval = job.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = job.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);
    Path out = FileOutputFormat.getOutputPath(job);

    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");

    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
            CompressionType.RECORD, progress);

    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class,
            compType, progress);

    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class,
            compType, progress);

    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {

            String fromUrl = key.toString();
            String fromHost = null;
            textOut.append(key, new ParseText(parse.getText()));

            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }

            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);

            try {
                ParseStatus pstatus = parseData.getStatus();
                if (pstatus != null && pstatus.isSuccess()
                        && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                    String newUrl = pstatus.getMessage();
                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);

                    try {
                        if (normalizers != null) {
                            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                        }
                    } catch (MalformedURLException mfue) {
                        newUrl = null;
                    }

                    if (filters != null) {
                        if (newUrl != null)
                            newUrl = filters.filter(newUrl);
                    }

                    String url = key.toString();
                    if (newUrl != null && !newUrl.equals(url)) {
                        String reprUrl = URLUtil.chooseRepr(url, newUrl,
                                refreshTime < Fetcher.PERM_REFRESH_TIME);
                        CrawlDatum newDatum = new CrawlDatum();
                        newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                        if (reprUrl != null && !reprUrl.equals(newUrl)) {
                            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                        }
                        crawlOut.append(new Text(newUrl), newDatum);
                    }
                }
            } catch (URLFilterException e) {
                // ignore
            }

            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
                try {
                    fromHost = new URL(fromUrl).getHost().toLowerCase();
                } catch (MalformedURLException e) {
                    fromHost = null;
                }
            } else {
                fromHost = null;
            }

            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();

                // Only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
                            filters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }

                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                    target.setScore(0.0f);
                }

                targets.add(new SimpleEntry(targetUrl, target));

                // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }

            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks((Text) key, parseData, targets, null,
                        links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);

            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks,
                    parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);
            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(Reporter reporter) throws IOException {
            textOut.close();
            dataOut.close();
            crawlOut.close();
        }

    };

}

From source file:org.apache.sqoop.connector.hdfs.hdfsWriter.HdfsSequenceWriter.java

License:Apache License

@SuppressWarnings("deprecation")
public void initialize(Path filepath, Configuration conf, CompressionCodec codec) throws IOException {
    if (codec != null) {
        filewriter = SequenceFile.createWriter(filepath.getFileSystem(conf), conf, filepath, Text.class,
                NullWritable.class, SequenceFile.CompressionType.BLOCK, codec);
    } else {
        filewriter = SequenceFile.createWriter(filepath.getFileSystem(conf), conf, filepath, Text.class,
                NullWritable.class, SequenceFile.CompressionType.NONE);
    }

    text = new Text();
}