Example usage for org.apache.hadoop.mapred Reporter getCounter

Introduction

On this page you can find example usage of org.apache.hadoop.mapred Reporter getCounter, collected from open-source projects.

Prototype

public abstract Counter getCounter(String group, String name);

Document

Get the Counter of the given group with the given name.
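
Before the project examples below, here is a minimal sketch of the typical pattern: look up a Counters.Counter by group and name through the Reporter passed into map(), then increment it. This snippet is not taken from any of the projects listed; the class, group, and counter names are invented for illustration.

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Minimal sketch; "EXAMPLE", "EMPTY_RECORDS" and "RECORDS_PROCESSED" are
// hypothetical group/counter names chosen only for illustration.
public class CountingMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, IntWritable> output,
            Reporter reporter) throws IOException {
        if (value.toString().trim().isEmpty()) {
            // getCounter returns the named counter of the group, creating it on first use.
            reporter.getCounter("EXAMPLE", "EMPTY_RECORDS").increment(1);
            return;
        }
        output.collect(value, ONE);
        reporter.getCounter("EXAMPLE", "RECORDS_PROCESSED").increment(1);
    }
}

Counters incremented this way are aggregated by the framework and reported per job, which is how the examples below track things like rows inserted, detected languages, and MIME types.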

Usage

From source file: co.nubetech.hiho.mapred.MySQLLoadDataMapper.java

License: Apache License

@Override
public void map(Text key, FSDataInputStream val, OutputCollector<NullWritable, NullWritable> collector,
        Reporter reporter) throws IOException {

    conn = getConnection();
    com.mysql.jdbc.Statement stmt = null;
    String query;

    String[] columnNames = null;
    if (hasHeaderLine) {
        BufferedReader headerReader = new BufferedReader(new InputStreamReader(val));
        String header = headerReader.readLine();
        if (header == null)
            return;
        columnNames = header.split(",");
        val.seek(header.getBytes(utf8).length + newline.length);
    }
    try {

        stmt = (com.mysql.jdbc.Statement) conn.createStatement(ResultSet.TYPE_SCROLL_SENSITIVE,
                ResultSet.CONCUR_UPDATABLE);
        String tablename = (keyIsTableName ? keyToTablename(key) : "");
        if (disableKeys && !tablename.equals("")) {
            reporter.setStatus("Disabling keys on " + tablename);
            stmt.execute("ALTER TABLE " + tablename + " DISABLE KEYS");
        }
        stmt.setLocalInfileInputStream(val);
        query = "load data local infile 'abc.txt' into table " + tablename + " ";
        query += querySuffix;
        if (hasHeaderLine)
            query += " (" + StringUtils.join(columnNames, ",") + ")";
        reporter.setStatus("Inserting into " + tablename);
        logger.debug("stmt: " + query);
        int rows = stmt.executeUpdate(query);
        logger.debug(rows + " rows updated");
        if (disableKeys && !tablename.equals("")) {
            reporter.setStatus("Re-enabling keys on " + tablename);
            stmt.execute("ALTER TABLE " + tablename + " ENABLE KEYS");
        }
        if (!tablename.equals(""))
            reporter.getCounter("MySQLLoadCounters", "ROWS_INSERTED_TABLE_" + tablename).increment(rows);
        reporter.getCounter("MySQLLoadCounters", "ROWS_INSERTED_TOTAL").increment(rows);

    } catch (Exception e) {
        e.printStackTrace();
        stmt = null;
        throw new IOException(e);
    } finally {
        try {
            if (stmt != null) {
                stmt.close();
            }
        } catch (SQLException s) {
            s.printStackTrace();
        }
    }
}

From source file: com.cloudera.recordservice.mapred.RecordServiceInputFormatBase.java

License: Apache License

/**
 * Populates RecordService counters in ctx from counters.
 */
public static void setCounters(Reporter ctx, TaskStatus.Stats counters) {
    if (ctx == null)
        return;
    ctx.getCounter(COUNTERS_GROUP_NAME, "Records Read").setValue(counters.numRecordsRead);
    ctx.getCounter(COUNTERS_GROUP_NAME, "Records Returned").setValue(counters.numRecordsReturned);
    ctx.getCounter(COUNTERS_GROUP_NAME, "Record Serialization Time(ms)").setValue(counters.serializeTimeMs);
    ctx.getCounter(COUNTERS_GROUP_NAME, "Client Time(ms)").setValue(counters.clientTimeMs);

    if (counters.hdfsCountersSet) {
        ctx.getCounter(COUNTERS_GROUP_NAME, "Bytes Read").setValue(counters.bytesRead);
        ctx.getCounter(COUNTERS_GROUP_NAME, "Decompression Time(ms)").setValue(counters.decompressTimeMs);
        ctx.getCounter(COUNTERS_GROUP_NAME, "Bytes Read Local").setValue(counters.bytesReadLocal);
        ctx.getCounter(COUNTERS_GROUP_NAME, "HDFS Throughput(MB/s)")
                .setValue((long) (counters.hdfsThroughput / (1024 * 1024)));
    }
}

From source file: com.digitalpebble.behemoth.languageidentification.LanguageIdProcessor.java

License: Apache License

public BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    // check that it has some text
    if (inputDoc.getText() == null) {
        LOG.info("No text for " + inputDoc.getUrl() + " skipping");
        reporter.getCounter("LANGUAGE ID", "MISSING TEXT").increment(1);
        return new BehemothDocument[] { inputDoc };
    }

    String lang = null;

    // skip docs with empty text
    if (inputDoc.getText().trim().isEmpty()) {
        LOG.info("Empty text for " + inputDoc.getUrl() + " skipping");
        reporter.getCounter("LANGUAGE ID", "EMPTY TEXT").increment(1);
        return new BehemothDocument[] { inputDoc };
    }

    try {
        Detector detector = DetectorFactory.create();
        detector.append(inputDoc.getText());
        lang = detector.detect();
        inputDoc.getMetadata(true).put(languageMDKey, new Text(lang));
    } catch (LangDetectException e) {
        LOG.error("Exception on doc " + inputDoc.getUrl(), e);
        lang = null;
    }

    if (reporter != null && lang != null)
        reporter.getCounter("LANGUAGE DETECTED", lang).increment(1);

    return new BehemothDocument[] { inputDoc };
}

From source file: com.digitalpebble.behemoth.tika.TikaProcessor.java

License: Apache License

/**
 * Process a BehemothDocument with Tika
 * 
 * @return an array of documents or null if an exception is encountered
 */
public BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    // check that it has some text or content
    if (inputDoc.getContent() == null && inputDoc.getText() == null) {
        LOG.info("No content or text for " + inputDoc.getUrl() + " skipping");
        if (reporter != null)
            reporter.getCounter("TIKA", "NO CONTENT OR TEXT").increment(1);
        return new BehemothDocument[] { inputDoc };
    }

    // determine the content type if missing
    if (inputDoc.getContentType() == null || inputDoc.getContentType().isEmpty()) {
        String mt = null;
        // using the original content
        if (mimeType == null || forceMTDetection) {
            if (inputDoc.getContent() != null) {
                Metadata meta = new Metadata();
                meta.set(Metadata.RESOURCE_NAME_KEY, inputDoc.getUrl());
                MimeType mimetype = null;
                try {
                    MediaType mediaType = detector.detect(new ByteArrayInputStream(inputDoc.getContent()),
                            meta);
                    mimetype = mimetypes.forName(mediaType.getType() + "/" + mediaType.getSubtype());
                } catch (IOException e) {
                    LOG.error("Exception", e);
                } catch (MimeTypeException e) {
                    LOG.error("Exception", e);
                }
                mt = mimetype.getName();
            } else if (mimeType == null && inputDoc.getText() != null) {
                // force it to text
                mt = "text/plain";
            }
        } else {
            mt = mimeType;// allow outside user to specify a mime type if
            // they know all the content, saves time and
            // reduces error
        }
        if (mt != null) {
            inputDoc.setContentType(mt);
        }
    }

    // determine which parser to use
    Parser parser = TikaConfig.getDefaultConfig().getParser();

    // skip the processing if the input document already has some text
    if (inputDoc.getText() != null) {
        if (reporter != null)
            reporter.getCounter("TIKA", "TEXT ALREADY AVAILABLE").increment(1);
        return new BehemothDocument[] { inputDoc };
    }

    // filter based on content length
    // optional
    int length = inputDoc.getContent().length;
    if (contentLengthThresholdFilter != -1 && length > contentLengthThresholdFilter) {
        if (reporter != null)
            reporter.getCounter("TIKA", "FILTERED-CONTENT-LENGTH").increment(1);
        return new BehemothDocument[] { inputDoc };
    }

    // otherwise parse the document and retrieve the text, metadata and
    // markup annotations

    InputStream is = new ByteArrayInputStream(inputDoc.getContent());

    Metadata metadata = new Metadata();
    // put the mimetype in the metadata so that Tika can
    // decide which parser to use
    metadata.set(Metadata.CONTENT_TYPE, inputDoc.getContentType());

    String ct = inputDoc.getContentType();
    try {
        if (reporter != null && okCounters)
            reporter.getCounter("MIME-TYPE", ct).increment(1);
    } catch (Exception counterEx) {
        LOG.error("Could not add counter MIME-TYPE:" + ct, counterEx);
        okCounters = false;
    }

    // TODO check config whether want the markup or just the text and
    // metadata?
    BehemothHandler handler = new TikaMarkupHandler();

    boolean doMarkup = config.getBoolean("tika.convert.markup", true);

    if (!doMarkup) {
        handler = new TikaTextHandler();
    }

    ParseContext context = new ParseContext();

    // TODO generalise the approach so that can set any class via context
    String customMapper = config.get("tika.context.HtmlMapper.class");
    if (customMapper != null) {
        try {
            Class<HtmlMapper> customMapperClass = (Class<HtmlMapper>) Class.forName(customMapper);
            // specify a custom HTML mapper via the Context
            context.set(HtmlMapper.class, customMapperClass.newInstance());
        } catch (Exception e) {
            LOG.error("Can't use class " + customMapper + " for HtmlMapper, using default");
        }
    }

    try {
        parser.parse(is, handler, metadata, context);
        processMetadata(inputDoc, metadata);
        processText(inputDoc, handler.getText());
        processMarkupAnnotations(inputDoc, handler.getAnnotations());
        if (reporter != null)
            reporter.getCounter("TIKA", "ANNOTATIONS ADDED").increment(handler.getAnnotations().size());
    } catch (Exception e) {
        LOG.error(inputDoc.getUrl().toString(), e);
        if (reporter != null)
            reporter.getCounter("TIKA", "PARSING_ERROR").increment(1);
        return new BehemothDocument[] { inputDoc };
    } finally {
        try {
            is.close();
        } catch (IOException e) {
        }
    }

    // TODO if the content type is an archive maybe process and return
    // all the subdocuments
    if (reporter != null)
        reporter.getCounter("TIKA", "DOC-PARSED").increment(1);

    return new BehemothDocument[] { inputDoc };
}

From source file: com.ebay.erl.mobius.core.mapred.DefaultMobiusReducer.java

License: Apache License

private void output(Tuple aTuple, OutputCollector<NullWritable, WritableComparable<?>> output,
        Reporter reporter) throws IOException {
    aTuple.setToStringOrdering(this.outputColumnNames);
    if (this._persistantCriteria != null) {
        if (this._persistantCriteria.accept(aTuple, this.conf)) {
            output.collect(NullWritable.get(), aTuple);
            reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
        } else {
            reporter.getCounter("Join/Grouping Records", "FILTERED").increment(1);
        }
    } else {
        output.collect(NullWritable.get(), aTuple);
        reporter.getCounter("Join/Grouping Records", "EMITTED").increment(1);
    }
}

From source file: com.TCG.Nutch_DNS.HostDbReducer.java

License: Apache License

public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
        Reporter reporter) throws IOException {

    CrawlDatum fetch = new CrawlDatum();
    CrawlDatum old = new CrawlDatum();

    boolean fetchSet = false;
    boolean oldSet = false;
    byte[] signature = null;
    boolean multiple = false; // avoid deep copy when only single value exists
    linked.clear();
    org.apache.hadoop.io.MapWritable metaFromParse = null;

    while (values.hasNext()) {
        CrawlDatum datum = values.next();
        if (!multiple && values.hasNext())
            multiple = true;
        if (CrawlDatum.hasDbStatus(datum)) {
            if (!oldSet) {
                if (multiple) {
                    old.set(datum);
                } else {
                    // no need for a deep copy - this is the only value
                    old = datum;
                }
                oldSet = true;
            } else {
                // always take the latest version
                if (old.getFetchTime() < datum.getFetchTime())
                    old.set(datum);
            }
            continue;
        }

        if (CrawlDatum.hasFetchStatus(datum)) {
            if (!fetchSet) {
                if (multiple) {
                    fetch.set(datum);
                } else {
                    fetch = datum;
                }
                fetchSet = true;
            } else {
                // always take the latest version
                if (fetch.getFetchTime() < datum.getFetchTime())
                    fetch.set(datum);
            }
            continue;
        }

        switch (datum.getStatus()) { // collect other info
        case CrawlDatum.STATUS_LINKED:
            CrawlDatum link;
            if (multiple) {
                link = new CrawlDatum();
                link.set(datum);
            } else {
                link = datum;
            }
            linked.insert(link);
            break;
        case CrawlDatum.STATUS_SIGNATURE:
            signature = datum.getSignature();
            break;
        case CrawlDatum.STATUS_PARSE_META:
            metaFromParse = datum.getMetaData();
            break;
        default:
            LOG.warn("Unknown status, key: " + key + ", datum: " + datum);
        }
    }

    // copy the content of the queue into a List
    // in reversed order
    int numLinks = linked.size();
    List<CrawlDatum> linkList = new ArrayList<CrawlDatum>(numLinks);
    for (int i = numLinks - 1; i >= 0; i--) {
        linkList.add(linked.pop());
    }

    // if it doesn't already exist, skip it
    if (!oldSet && !additionsAllowed)
        return;

    // if there is no fetched datum, perhaps there is a link
    if (!fetchSet && linkList.size() > 0) {
        fetch = linkList.get(0);
        fetchSet = true;
    }

    // still no new data - record only unchanged old data, if exists, and return
    if (!fetchSet) {
        if (oldSet) {// at this point at least "old" should be present
            output.collect(key, old);
            reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(old.getStatus())).increment(1);
        } else {
            LOG.warn("Missing fetch and old value, signature=" + signature);
        }
        return;
    }

    if (signature == null)
        signature = fetch.getSignature();
    long prevModifiedTime = oldSet ? old.getModifiedTime() : 0L;
    long prevFetchTime = oldSet ? old.getFetchTime() : 0L;

    // initialize with the latest version, be it fetch or link
    result.set(fetch);
    if (oldSet) {
        // copy metadata from old, if exists
        if (old.getMetaData().size() > 0) {
            result.putAllMetaData(old);
            // overlay with new, if any
            if (fetch.getMetaData().size() > 0)
                result.putAllMetaData(fetch);
        }
        // set the most recent valid value of modifiedTime
        if (old.getModifiedTime() > 0 && fetch.getModifiedTime() == 0) {
            result.setModifiedTime(old.getModifiedTime());
        }
    }

    switch (fetch.getStatus()) { // determine new status

    case CrawlDatum.STATUS_LINKED: // it was link
        if (oldSet) { // if old exists
            result.set(old); // use it
        } else {
            result = schedule.initializeSchedule(key, result);
            result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            try {
                scfilters.initialScore(key, result);
            } catch (ScoringFilterException e) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                }
                result.setScore(0.0f);
            }
        }
        break;

    case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
    case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
    case CrawlDatum.STATUS_FETCH_REDIR_PERM:
    case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
        // determine the modification status
        int modified = FetchSchedule.STATUS_UNKNOWN;
        if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
            modified = FetchSchedule.STATUS_NOTMODIFIED;
        } else if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
            // only successful fetches (but not redirects, NUTCH-1422)
            // are detected as "not modified" by signature comparison
            if (oldSet && old.getSignature() != null && signature != null) {
                if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
                    modified = FetchSchedule.STATUS_MODIFIED;
                } else {
                    modified = FetchSchedule.STATUS_NOTMODIFIED;
                }
            }
        }
        // set the schedule
        result = schedule.setFetchSchedule(key, result, prevFetchTime, prevModifiedTime, fetch.getFetchTime(),
                fetch.getModifiedTime(), modified);
        // set the result status and signature
        if (modified == FetchSchedule.STATUS_NOTMODIFIED) {
            result.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);

            // NUTCH-1341 The page is not modified according to its signature, let's
            // reset lastModified as well
            result.setModifiedTime(prevModifiedTime);

            if (oldSet)
                result.setSignature(old.getSignature());
        } else {
            switch (fetch.getStatus()) {
            case CrawlDatum.STATUS_FETCH_SUCCESS:
                result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
                break;
            case CrawlDatum.STATUS_FETCH_REDIR_PERM:
                result.setStatus(CrawlDatum.STATUS_DB_REDIR_PERM);
                break;
            case CrawlDatum.STATUS_FETCH_REDIR_TEMP:
                result.setStatus(CrawlDatum.STATUS_DB_REDIR_TEMP);
                break;
            default:
                LOG.warn("Unexpected status: " + fetch.getStatus() + " resetting to old status.");
                if (oldSet)
                    result.setStatus(old.getStatus());
                else
                    result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
            }
            result.setSignature(signature);
        }

        // https://issues.apache.org/jira/browse/NUTCH-1656
        if (metaFromParse != null) {
            for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
                result.getMetaData().put(e.getKey(), e.getValue());
            }
        }

        // if fetchInterval is larger than the system-wide maximum, trigger
        // an unconditional recrawl. This prevents the page from being stuck in the
        // NOTMODIFIED state when the old fetched copy was already removed with
        // old segments.
        if (maxInterval < result.getFetchInterval())
            result = schedule.forceRefetch(key, result, false);
        break;
    case CrawlDatum.STATUS_SIGNATURE:
        if (LOG.isWarnEnabled()) {
            LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + key);
        }
        return;
    case CrawlDatum.STATUS_FETCH_RETRY: // temporary failure
        if (oldSet) {
            result.setSignature(old.getSignature()); // use old signature
        }
        result = schedule.setPageRetrySchedule(key, result, prevFetchTime, prevModifiedTime,
                fetch.getFetchTime());
        if (result.getRetriesSinceFetch() < retryMax) {
            result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
        } else {
            result.setStatus(CrawlDatum.STATUS_DB_GONE);
            result = schedule.setPageGoneSchedule(key, result, prevFetchTime, prevModifiedTime,
                    fetch.getFetchTime());
        }
        break;

    case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
        if (oldSet)
            result.setSignature(old.getSignature()); // use old signature
        result.setStatus(CrawlDatum.STATUS_DB_GONE);
        result = schedule.setPageGoneSchedule(key, result, prevFetchTime, prevModifiedTime,
                fetch.getFetchTime());
        break;

    default:
        throw new RuntimeException("Unknown status: " + fetch.getStatus() + " " + key);
    }

    try {
        scfilters.updateDbScore(key, oldSet ? old : null, result, linkList);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't update score, key=" + key + ": " + e);
        }
    }
    // remove generation time, if any
    result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
    output.collect(key, result);
    reporter.getCounter("CrawlDB status", CrawlDatum.getStatusName(result.getStatus())).increment(1);
}

From source file: gobblin.metrics.hadoop.HadoopCounterReporterTest.java

License: Apache License

@BeforeClass
public void setUp() throws Exception {

    String contextName = CONTEXT_NAME + "_" + UUID.randomUUID().toString();

    Reporter mockedReporter = Mockito.mock(Reporter.class);

    this.recordsProcessedCount = Mockito.mock(Counters.Counter.class);
    Mockito.when(mockedReporter.getCounter(contextName,
            MetricRegistry.name(RECORDS_PROCESSED, Measurements.COUNT.getName())))
            .thenReturn(this.recordsProcessedCount);

    this.recordProcessRateCount = Mockito.mock(Counters.Counter.class);
    Mockito.when(mockedReporter.getCounter(contextName,
            MetricRegistry.name(RECORD_PROCESS_RATE, Measurements.COUNT.getName())))
            .thenReturn(this.recordProcessRateCount);

    this.recordSizeDistributionCount = Mockito.mock(Counters.Counter.class);
    Mockito.when(mockedReporter.getCounter(contextName,
            MetricRegistry.name(RECORD_SIZE_DISTRIBUTION, Measurements.COUNT.getName())))
            .thenReturn(this.recordSizeDistributionCount);

    this.totalDurationCount = Mockito.mock(Counters.Counter.class);
    Mockito.when(mockedReporter.getCounter(contextName,
            MetricRegistry.name(TOTAL_DURATION, Measurements.COUNT.getName())))
            .thenReturn(this.totalDurationCount);

    this.queueSize = Mockito.mock(Counters.Counter.class);
    Mockito.when(mockedReporter.getCounter(contextName, QUEUE_SIZE)).thenReturn(this.queueSize);

    this.hadoopCounterReporter = HadoopCounterReporter.builder(mockedReporter).convertRatesTo(TimeUnit.SECONDS)
            .convertDurationsTo(TimeUnit.SECONDS).filter(MetricFilter.ALL)
            .build(MetricContext.builder(contextName).buildStrict());
}

From source file: hivemall.fm.FactorizationMachineUDTF.java

License: Apache License

protected void runTrainingIteration(int iterations) throws HiveException {
    final ByteBuffer inputBuf = this._inputBuf;
    final NioStatefullSegment fileIO = this._fileIO;
    assert (inputBuf != null);
    assert (fileIO != null);
    final long numTrainingExamples = _t;
    final boolean adaregr = _va_rand != null;

    final Reporter reporter = getReporter();
    final Counter iterCounter = (reporter == null) ? null
            : reporter.getCounter("hivemall.fm.FactorizationMachines$Counter", "iteration");

    try {
        if (fileIO.getPosition() == 0L) {// run iterations w/o temporary file
            if (inputBuf.position() == 0) {
                return; // no training example
            }
            inputBuf.flip();

            int iter = 2;
            for (; iter <= iterations; iter++) {
                reportProgress(reporter);
                setCounterValue(iterCounter, iter);

                while (inputBuf.remaining() > 0) {
                    int bytes = inputBuf.getInt();
                    assert (bytes > 0) : bytes;
                    int xLength = inputBuf.getInt();
                    final Feature[] x = new Feature[xLength];
                    for (int j = 0; j < xLength; j++) {
                        x[j] = instantiateFeature(inputBuf);
                    }
                    double y = inputBuf.getDouble();
                    // invoke train
                    ++_t;
                    train(x, y, adaregr);
                }
                if (_cvState.isConverged(iter, numTrainingExamples)) {
                    break;
                }
                inputBuf.rewind();
            }
            LOG.info("Performed " + Math.min(iter, iterations) + " iterations of "
                    + NumberUtils.formatNumber(numTrainingExamples) + " training examples on memory (thus "
                    + NumberUtils.formatNumber(_t) + " training updates in total) ");
        } else {// read training examples in the temporary file and invoke train for each example

            // write training examples in buffer to a temporary file
            if (inputBuf.remaining() > 0) {
                writeBuffer(inputBuf, fileIO);
            }
            try {
                fileIO.flush();
            } catch (IOException e) {
                throw new HiveException("Failed to flush a file: " + fileIO.getFile().getAbsolutePath(), e);
            }
            if (LOG.isInfoEnabled()) {
                File tmpFile = fileIO.getFile();
                LOG.info(
                        "Wrote " + numTrainingExamples + " records to a temporary file for iterative training: "
                                + tmpFile.getAbsolutePath() + " (" + FileUtils.prettyFileSize(tmpFile) + ")");
            }

            // run iterations
            int iter = 2;
            for (; iter <= iterations; iter++) {
                setCounterValue(iterCounter, iter);

                inputBuf.clear();
                fileIO.resetPosition();
                while (true) {
                    reportProgress(reporter);
                    // TODO prefetch
                    // writes training examples to a buffer in the temporary file
                    final int bytesRead;
                    try {
                        bytesRead = fileIO.read(inputBuf);
                    } catch (IOException e) {
                        throw new HiveException("Failed to read a file: " + fileIO.getFile().getAbsolutePath(),
                                e);
                    }
                    if (bytesRead == 0) { // reached file EOF
                        break;
                    }
                    assert (bytesRead > 0) : bytesRead;

                    // reads training examples from a buffer
                    inputBuf.flip();
                    int remain = inputBuf.remaining();
                    if (remain < INT_BYTES) {
                        throw new HiveException("Illegal file format was detected");
                    }
                    while (remain >= INT_BYTES) {
                        int pos = inputBuf.position();
                        int recordBytes = inputBuf.getInt();
                        remain -= INT_BYTES;
                        if (remain < recordBytes) {
                            inputBuf.position(pos);
                            break;
                        }

                        final int xLength = inputBuf.getInt();
                        final Feature[] x = new Feature[xLength];
                        for (int j = 0; j < xLength; j++) {
                            x[j] = instantiateFeature(inputBuf);
                        }
                        double y = inputBuf.getDouble();

                        // invoke training
                        ++_t;
                        train(x, y, adaregr);

                        remain -= recordBytes;
                    }
                    inputBuf.compact();
                }
                if (_cvState.isConverged(iter, numTrainingExamples)) {
                    break;
                }
            }
            LOG.info("Performed " + Math.min(iter, iterations) + " iterations of "
                    + NumberUtils.formatNumber(numTrainingExamples)
                    + " training examples on a secondary storage (thus " + NumberUtils.formatNumber(_t)
                    + " training updates in total)");
        }
    } finally {
        // delete the temporary file and release resources
        try {
            fileIO.close(true);
        } catch (IOException e) {
            throw new HiveException("Failed to close a file: " + fileIO.getFile().getAbsolutePath(), e);
        }
        this._inputBuf = null;
        this._fileIO = null;
    }
}

From source file: hivemall.GeneralLearnerBaseUDTF.java

License: Apache License

protected final void runIterativeTraining(@Nonnegative final int iterations) throws HiveException {
    final ByteBuffer buf = this.inputBuf;
    final NioStatefulSegment dst = this.fileIO;
    assert (buf != null);
    assert (dst != null);
    final long numTrainingExamples = count;

    final Reporter reporter = getReporter();
    final Counters.Counter iterCounter = (reporter == null) ? null
            : reporter.getCounter("hivemall.GeneralLearnerBase$Counter", "iteration");

    try {
        if (dst.getPosition() == 0L) {// run iterations w/o temporary file
            if (buf.position() == 0) {
                return; // no training example
            }
            buf.flip();

            for (int iter = 2; iter <= iterations; iter++) {
                cvState.next();
                reportProgress(reporter);
                setCounterValue(iterCounter, iter);

                while (buf.remaining() > 0) {
                    int recordBytes = buf.getInt();
                    assert (recordBytes > 0) : recordBytes;
                    int featureVectorLength = buf.getInt();
                    final FeatureValue[] featureVector = new FeatureValue[featureVectorLength];
                    for (int j = 0; j < featureVectorLength; j++) {
                        featureVector[j] = readFeatureValue(buf, featureType);
                    }
                    float target = buf.getFloat();
                    train(featureVector, target);
                }
                buf.rewind();

                if (is_mini_batch) { // Update model with accumulated delta
                    batchUpdate();
                }

                if (cvState.isConverged(numTrainingExamples)) {
                    break;
                }
            }
            logger.info("Performed " + cvState.getCurrentIteration() + " iterations of "
                    + NumberUtils.formatNumber(numTrainingExamples) + " training examples on memory (thus "
                    + NumberUtils.formatNumber(numTrainingExamples * cvState.getCurrentIteration())
                    + " training updates in total) ");
        } else {// read training examples in the temporary file and invoke train for each example
            // write training examples in buffer to a temporary file
            if (buf.remaining() > 0) {
                writeBuffer(buf, dst);
            }
            try {
                dst.flush();
            } catch (IOException e) {
                throw new HiveException("Failed to flush a file: " + dst.getFile().getAbsolutePath(), e);
            }
            if (logger.isInfoEnabled()) {
                File tmpFile = dst.getFile();
                logger.info(
                        "Wrote " + numTrainingExamples + " records to a temporary file for iterative training: "
                                + tmpFile.getAbsolutePath() + " (" + FileUtils.prettyFileSize(tmpFile) + ")");
            }

            // run iterations
            for (int iter = 2; iter <= iterations; iter++) {
                cvState.next();
                setCounterValue(iterCounter, iter);

                buf.clear();
                dst.resetPosition();
                while (true) {
                    reportProgress(reporter);
                    // TODO prefetch
                    // writes training examples to a buffer in the temporary file
                    final int bytesRead;
                    try {
                        bytesRead = dst.read(buf);
                    } catch (IOException e) {
                        throw new HiveException("Failed to read a file: " + dst.getFile().getAbsolutePath(), e);
                    }
                    if (bytesRead == 0) { // reached file EOF
                        break;
                    }
                    assert (bytesRead > 0) : bytesRead;

                    // reads training examples from a buffer
                    buf.flip();
                    int remain = buf.remaining();
                    if (remain < SizeOf.INT) {
                        throw new HiveException("Illegal file format was detected");
                    }
                    while (remain >= SizeOf.INT) {
                        int pos = buf.position();
                        int recordBytes = buf.getInt();
                        remain -= SizeOf.INT;

                        if (remain < recordBytes) {
                            buf.position(pos);
                            break;
                        }

                        int featureVectorLength = buf.getInt();
                        final FeatureValue[] featureVector = new FeatureValue[featureVectorLength];
                        for (int j = 0; j < featureVectorLength; j++) {
                            featureVector[j] = readFeatureValue(buf, featureType);
                        }
                        float target = buf.getFloat();
                        train(featureVector, target);

                        remain -= recordBytes;
                    }
                    buf.compact();
                }

                if (is_mini_batch) { // Update model with accumulated delta
                    batchUpdate();
                }

                if (cvState.isConverged(numTrainingExamples)) {
                    break;
                }
            }
            logger.info("Performed " + cvState.getCurrentIteration() + " iterations of "
                    + NumberUtils.formatNumber(numTrainingExamples)
                    + " training examples on a secondary storage (thus "
                    + NumberUtils.formatNumber(numTrainingExamples * cvState.getCurrentIteration())
                    + " training updates in total)");
        }
    } catch (Throwable e) {
        throw new HiveException("Exception caused in the iterative training", e);
    } finally {
        // delete the temporary file and release resources
        try {
            dst.close(true);
        } catch (IOException e) {
            throw new HiveException("Failed to close a file: " + dst.getFile().getAbsolutePath(), e);
        }
        this.inputBuf = null;
        this.fileIO = null;
    }
}

From source file: hivemall.mf.BPRMatrixFactorizationUDTF.java

License: Apache License

private final void runIterativeTraining(@Nonnegative final int iterations) throws HiveException {
    final ByteBuffer inputBuf = this.inputBuf;
    final NioFixedSegment fileIO = this.fileIO;
    assert (inputBuf != null);
    assert (fileIO != null);
    final long numTrainingExamples = count;

    final Reporter reporter = getReporter();
    final Counter iterCounter = (reporter == null) ? null
            : reporter.getCounter("hivemall.mf.BPRMatrixFactorization$Counter", "iteration");

    try {
        if (lastWritePos == 0) {// run iterations w/o temporary file
            if (inputBuf.position() == 0) {
                return; // no training example
            }
            inputBuf.flip();

            int iter = 2;
            for (; iter <= iterations; iter++) {
                reportProgress(reporter);
                setCounterValue(iterCounter, iter);

                while (inputBuf.remaining() > 0) {
                    int u = inputBuf.getInt();
                    int i = inputBuf.getInt();
                    int j = inputBuf.getInt();
                    // invoke train
                    count++;
                    train(u, i, j);
                }
                cvState.multiplyLoss(0.5d);
                cvState.logState(iter, eta());
                if (cvState.isConverged(iter, numTrainingExamples)) {
                    break;
                }
                if (cvState.isLossIncreased()) {
                    etaEstimator.update(1.1f);
                } else {
                    etaEstimator.update(0.5f);
                }
                inputBuf.rewind();
            }
            LOG.info("Performed " + Math.min(iter, iterations) + " iterations of "
                    + NumberUtils.formatNumber(numTrainingExamples) + " training examples on memory (thus "
                    + NumberUtils.formatNumber(count) + " training updates in total) ");
        } else {// read training examples in the temporary file and invoke train for each example

            // write training examples in buffer to a temporary file
            if (inputBuf.position() > 0) {
                writeBuffer(inputBuf, fileIO, lastWritePos);
            } else if (lastWritePos == 0) {
                return; // no training example
            }
            try {
                fileIO.flush();
            } catch (IOException e) {
                throw new HiveException("Failed to flush a file: " + fileIO.getFile().getAbsolutePath(), e);
            }
            if (LOG.isInfoEnabled()) {
                File tmpFile = fileIO.getFile();
                LOG.info(
                        "Wrote " + numTrainingExamples + " records to a temporary file for iterative training: "
                                + tmpFile.getAbsolutePath() + " (" + FileUtils.prettyFileSize(tmpFile) + ")");
            }

            // run iterations
            int iter = 2;
            for (; iter <= iterations; iter++) {
                setCounterValue(iterCounter, iter);

                inputBuf.clear();
                long seekPos = 0L;
                while (true) {
                    reportProgress(reporter);
                    // TODO prefetch
                    // writes training examples to a buffer in the temporary file
                    final int bytesRead;
                    try {
                        bytesRead = fileIO.read(seekPos, inputBuf);
                    } catch (IOException e) {
                        throw new HiveException("Failed to read a file: " + fileIO.getFile().getAbsolutePath(),
                                e);
                    }
                    if (bytesRead == 0) { // reached file EOF
                        break;
                    }
                    assert (bytesRead > 0) : bytesRead;
                    seekPos += bytesRead;

                    // reads training examples from a buffer
                    inputBuf.flip();
                    int remain = inputBuf.remaining();
                    assert (remain > 0) : remain;
                    for (; remain >= RECORD_BYTES; remain -= RECORD_BYTES) {
                        int u = inputBuf.getInt();
                        int i = inputBuf.getInt();
                        int j = inputBuf.getInt();
                        // invoke train
                        count++;
                        train(u, i, j);
                    }
                    inputBuf.compact();
                }
                cvState.multiplyLoss(0.5d);
                cvState.logState(iter, eta());
                if (cvState.isConverged(iter, numTrainingExamples)) {
                    break;
                }
                if (cvState.isLossIncreased()) {
                    etaEstimator.update(1.1f);
                } else {
                    etaEstimator.update(0.5f);
                }
            }
            LOG.info("Performed " + Math.min(iter, iterations) + " iterations of "
                    + NumberUtils.formatNumber(numTrainingExamples)
                    + " training examples using a secondary storage (thus " + NumberUtils.formatNumber(count)
                    + " training updates in total)");
        }
    } finally {
        // delete the temporary file and release resources
        try {
            fileIO.close(true);
        } catch (IOException e) {
            throw new HiveException("Failed to close a file: " + fileIO.getFile().getAbsolutePath(), e);
        }
        this.inputBuf = null;
        this.fileIO = null;
    }
}