Example usage for org.apache.mahout.cf.taste.model IDMigrator toLongID

List of usage examples for org.apache.mahout.cf.taste.model IDMigrator toLongID

Introduction

On this page you can find example usages of the toLongID method of org.apache.mahout.cf.taste.model.IDMigrator.

Prototype

long toLongID(String stringID);

Source Link

Usage

From source file: net.myrrix.common.OneWayMigratorTest.java

License:Apache License

/**
 * Checks that {@link OneWayMigrator#toLongID(String)} maps two fixed strings, {@code "foobar"} and
 * the empty string, to the exact {@code long} values pinned below.
 */
@Test
public void testForward() throws Exception {
    IDMigrator oneWay = new OneWayMigrator();

    long hashedFoobar = oneWay.toLongID("foobar");
    assertEquals(4060265690780417169L, hashedFoobar);

    // The empty string is a valid input and has its own fixed expected value.
    long hashedEmpty = oneWay.toLongID("");
    assertEquals(-3162216497309240828L, hashedEmpty);
}

From source file: net.myrrix.online.generation.InputFilesReader.java

License:Apache License

/**
 * Reads every CSV input file in a directory and folds its contents into the given matrices.
 *
 * <p>Files whose names match {@code .+\.csv(\.(zip|gz))?} are read; all other files are logged and
 * skipped. Matching files are processed in the order defined by {@code ByLastModifiedComparator}
 * (presumably oldest first -- confirm against that comparator).</p>
 *
 * <p>Each non-empty line not starting with {@code '#'} is split with {@code COMMA} into
 * {@code user,item[,value]}:</p>
 * <ul>
 *  <li>An ID token starting with a double quote is a tag: the text between the first and last
 *   character is hashed to a {@code long} with a {@link OneWayMigrator}. Any other ID token is
 *   parsed as a {@code long}.</li>
 *  <li>A missing value column means {@code 1.0}. An empty value token means "remove this
 *   user/item association".</li>
 *  <li>A line where both user and item are tags is rejected.</li>
 * </ul>
 *
 * <p>Unparseable lines are logged and counted; once more than 100 have been seen the read is
 * aborted. The first line of each file is treated as a header if it cannot be parsed, and is not
 * counted as bad.</p>
 *
 * @param knownItemIDs per-user set of item IDs to update alongside the matrices; may be
 *  {@code null}, in which case it is not maintained
 * @param rbyRow matrix updated via {@code MatrixUtils} with (user, item, value)
 * @param rbyColumn companion matrix passed to the same {@code MatrixUtils} calls
 * @param itemTagIDs receives the hashed ID whenever the <em>user</em> column holds a tag
 * @param userTagIDs receives the hashed ID whenever the <em>item</em> column holds a tag
 * @param inputDir directory to scan for input files
 * @throws IOException if more than 100 bad lines are encountered, or if reading a file fails
 */
static void readInputFiles(FastByIDMap<FastIDSet> knownItemIDs, FastByIDMap<FastByIDFloatMap> rbyRow,
        FastByIDMap<FastByIDFloatMap> rbyColumn, FastIDSet itemTagIDs, FastIDSet userTagIDs, File inputDir)
        throws IOException {

    FilenameFilter csvFilter = new PatternFilenameFilter(".+\\.csv(\\.(zip|gz))?");

    File[] otherFiles = inputDir.listFiles(new InvertedFilenameFilter(csvFilter));
    if (otherFiles != null) {
        for (File otherFile : otherFiles) {
            log.info("Skipping file {}", otherFile.getName());
        }
    }

    File[] inputFiles = inputDir.listFiles(csvFilter);
    if (inputFiles == null) {
        // listFiles() returns null when inputDir is not a readable directory
        log.info("No input files in {}", inputDir);
        return;
    }
    Arrays.sort(inputFiles, ByLastModifiedComparator.INSTANCE);

    IDMigrator hash = new OneWayMigrator();

    int lines = 0; // total across all files; drives the progress log
    int badLines = 0;
    for (File inputFile : inputFiles) {
        log.info("Reading {}", inputFile);
        // Tracked per file so that a header line is recognized in every file, not just the first.
        int linesInFile = 0;
        for (String line : new FileLineIterable(inputFile)) {

            if (badLines > 100) { // Crude check
                throw new IOException("Too many bad lines; aborting");
            }

            lines++;
            linesInFile++;

            if (line.isEmpty() || line.charAt(0) == '#') {
                continue;
            }

            Iterator<String> it = COMMA.split(line).iterator();

            long userID;
            boolean userIsTag;
            long itemID;
            boolean itemIsTag;
            float value;
            try {

                String userIDString = it.next();
                userIsTag = userIDString.startsWith("\"");
                userID = parseIDOrTag(userIDString, userIsTag, hash);

                String itemIDString = it.next();
                itemIsTag = itemIDString.startsWith("\"");
                itemID = parseIDOrTag(itemIDString, itemIsTag, hash);

                if (it.hasNext()) {
                    String valueToken = it.next();
                    // Empty value is the "delete" marker; NaN carries that signal downstream
                    value = valueToken.isEmpty() ? Float.NaN : LangUtils.parseFloat(valueToken);
                } else {
                    value = 1.0f;
                }

            } catch (NoSuchElementException ignored) {
                log.warn("Ignoring line with too few columns: '{}'", line);
                badLines++;
                continue;
            } catch (IllegalArgumentException iae) { // includes NumberFormatException
                if (linesInFile == 1) {
                    log.info("Ignoring header line: '{}'", line);
                } else {
                    log.warn("Ignoring unparseable line: '{}'", line);
                    badLines++;
                }
                continue;
            }

            if (userIsTag && itemIsTag) {
                log.warn("Two tags not allowed: '{}'", line);
                badLines++;
                continue;
            }

            // A tag in the user column is recorded as an item tag, and vice versa.
            if (userIsTag) {
                itemTagIDs.add(userID);
            }

            if (itemIsTag) {
                userTagIDs.add(itemID);
            }

            if (Float.isNaN(value)) {
                // Remove, not set
                MatrixUtils.remove(userID, itemID, rbyRow, rbyColumn);
            } else {
                MatrixUtils.addTo(userID, itemID, value, rbyRow, rbyColumn);
            }

            if (knownItemIDs != null) {
                FastIDSet itemIDs = knownItemIDs.get(userID);
                if (Float.isNaN(value)) {
                    // Remove, not set
                    if (itemIDs != null) {
                        itemIDs.remove(itemID);
                        if (itemIDs.isEmpty()) {
                            knownItemIDs.remove(userID);
                        }
                    }
                } else {
                    if (itemIDs == null) {
                        itemIDs = new FastIDSet();
                        knownItemIDs.put(userID, itemIDs);
                    }
                    itemIDs.add(itemID);
                }
            }

            if (lines % 1000000 == 0) {
                log.info("Finished {} lines", lines);
            }
        }
    }

    log.info("Pruning near-zero entries");
    removeSmall(rbyRow);
    removeSmall(rbyColumn);
}

/**
 * Converts one ID column token to a {@code long}.
 *
 * @param token raw column text
 * @param isTag whether {@code token} starts with a double quote and so denotes a tag
 * @param hash migrator used to hash tag text
 * @return the hashed tag text when {@code isTag}, otherwise {@code token} parsed as a {@code long}
 * @throws IllegalArgumentException if a tag token is too short to have both an opening and a
 *  closing character, or if a non-tag token is not a valid {@code long}
 */
private static long parseIDOrTag(String token, boolean isTag, IDMigrator hash) {
    if (!isTag) {
        return Long.parseLong(token);
    }
    if (token.length() < 2) {
        // A lone quote would make substring(1, 0) throw StringIndexOutOfBoundsException, which the
        // caller does not catch; report it as an ordinary unparseable token instead.
        throw new IllegalArgumentException("Malformed quoted tag: " + token);
    }
    // Strips the first and last character; the last one is not verified to be a closing quote.
    return hash.toLongID(token.substring(1, token.length() - 1));
}