List of usage examples for the method org.apache.mahout.cf.taste.model.IDMigrator#toLongID
/** Converts the given string ID to a {@code long} ID. */
long toLongID(String stringID);
From source file:net.myrrix.common.OneWayMigratorTest.java
License:Apache License
/**
 * Checks that {@link OneWayMigrator#toLongID(String)} yields the expected fixed values
 * for a non-empty string and for the empty string.
 */
@Test
public void testForward() throws Exception {
  IDMigrator oneWay = new OneWayMigrator();

  long expectedForFoobar = 4060265690780417169L;
  long actualForFoobar = oneWay.toLongID("foobar");
  assertEquals(expectedForFoobar, actualForFoobar);

  long expectedForEmpty = -3162216497309240828L;
  long actualForEmpty = oneWay.toLongID("");
  assertEquals(expectedForEmpty, actualForEmpty);
}
From source file:net.myrrix.online.generation.InputFilesReader.java
License:Apache License
static void readInputFiles(FastByIDMap<FastIDSet> knownItemIDs, FastByIDMap<FastByIDFloatMap> rbyRow, FastByIDMap<FastByIDFloatMap> rbyColumn, FastIDSet itemTagIDs, FastIDSet userTagIDs, File inputDir) throws IOException { FilenameFilter csvFilter = new PatternFilenameFilter(".+\\.csv(\\.(zip|gz))?"); File[] otherFiles = inputDir.listFiles(new InvertedFilenameFilter(csvFilter)); if (otherFiles != null) { for (File otherFile : otherFiles) { log.info("Skipping file {}", otherFile.getName()); }/*w w w .j a v a 2 s . c om*/ } File[] inputFiles = inputDir.listFiles(csvFilter); if (inputFiles == null) { log.info("No input files in {}", inputDir); return; } Arrays.sort(inputFiles, ByLastModifiedComparator.INSTANCE); IDMigrator hash = new OneWayMigrator(); int lines = 0; int badLines = 0; for (File inputFile : inputFiles) { log.info("Reading {}", inputFile); for (String line : new FileLineIterable(inputFile)) { if (badLines > 100) { // Crude check throw new IOException("Too many bad lines; aborting"); } lines++; if (line.isEmpty() || line.charAt(0) == '#') { continue; } Iterator<String> it = COMMA.split(line).iterator(); long userID; boolean userIsTag; long itemID; boolean itemIsTag; float value; try { String userIDString = it.next(); userIsTag = userIDString.startsWith("\""); if (userIsTag) { userID = hash.toLongID(userIDString.substring(1, userIDString.length() - 1)); } else { userID = Long.parseLong(userIDString); } String itemIDString = it.next(); itemIsTag = itemIDString.startsWith("\""); if (itemIsTag) { itemID = hash.toLongID(itemIDString.substring(1, itemIDString.length() - 1)); } else { itemID = Long.parseLong(itemIDString); } if (it.hasNext()) { String valueToken = it.next(); value = valueToken.isEmpty() ? 
Float.NaN : LangUtils.parseFloat(valueToken); } else { value = 1.0f; } } catch (NoSuchElementException ignored) { log.warn("Ignoring line with too few columns: '{}'", line); badLines++; continue; } catch (IllegalArgumentException iae) { // includes NumberFormatException if (lines == 1) { log.info("Ignoring header line: '{}'", line); } else { log.warn("Ignoring unparseable line: '{}'", line); badLines++; } continue; } if (userIsTag && itemIsTag) { log.warn("Two tags not allowed: '{}'", line); badLines++; continue; } if (userIsTag) { itemTagIDs.add(userID); } if (itemIsTag) { userTagIDs.add(itemID); } if (Float.isNaN(value)) { // Remove, not set MatrixUtils.remove(userID, itemID, rbyRow, rbyColumn); } else { MatrixUtils.addTo(userID, itemID, value, rbyRow, rbyColumn); } if (knownItemIDs != null) { FastIDSet itemIDs = knownItemIDs.get(userID); if (Float.isNaN(value)) { // Remove, not set if (itemIDs != null) { itemIDs.remove(itemID); if (itemIDs.isEmpty()) { knownItemIDs.remove(userID); } } } else { if (itemIDs == null) { itemIDs = new FastIDSet(); knownItemIDs.put(userID, itemIDs); } itemIDs.add(itemID); } } if (lines % 1000000 == 0) { log.info("Finished {} lines", lines); } } } log.info("Pruning near-zero entries"); removeSmall(rbyRow); removeSmall(rbyColumn); }