com.mapr.synth.samplers.SsnSampler.java Source code

Java tutorial

Introduction

Here is the source code for com.mapr.synth.samplers.SsnSampler.java

Source

/*
 * Licensed to the Ted Dunning under one or more contributor license
 * agreements.  See the NOTICE file that may be
 * distributed with this work for additional information
 * regarding copyright ownership.  Ted Dunning licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.mapr.synth.samplers;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.JsonNodeFactory;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.databind.node.TextNode;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.common.io.Resources;
import org.apache.mahout.common.RandomUtils;

import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
 * Samples synthetic Social Security Numbers of the form {@code AAA-GG-SSSS}.
 * <p>
 * The three-digit area code {@code AAA} is picked uniformly at random from the codes loaded from
 * the built-in {@code ssn-seeds} resource, retrying until the picked code has one of the permitted
 * types (see {@link #setTypes(String)}).  The group {@code GG} is drawn from 01..99 and the serial
 * {@code SSSS} from 0001..9999.
 * <p>
 * In verbose mode (the default) a sample is a JSON object holding the selected fields (see
 * {@link #setFields(String)}); otherwise it is a bare text node containing only the number.
 */
public class SsnSampler extends FieldSampler {
    /** The only type names that {@link #setTypes(String)} accepts. */
    private static final Set<String> LEGAL_TYPES = ImmutableSet.of("normal", "extra");

    /** Splits caller-supplied lists on runs of whitespace, commas or semicolons, dropping empty items. */
    private static final Splitter LIST_SPLITTER = Splitter.on(Pattern.compile("[\\s,;]+")).omitEmptyStrings();

    private Random rand = RandomUtils.getRandom();

    private final JsonNodeFactory nodeFactory = JsonNodeFactory.withExactBigDecimals(false);

    /** Descriptive columns (everything after the first two columns) keyed by three-digit area code. */
    private final Map<String, List<String>> values = Maps.newHashMap();

    /** Every area code that was loaded, in load order; sampling indexes into this list. */
    private final List<String> codes = Lists.newArrayList();

    /** Names of the descriptive columns, aligned position by position with the lists in {@link #values}. */
    private final List<String> names;

    /** Position of the "type" column within {@link #names}, or -1 if the seed data has none. */
    private final int typeIndex;

    private Set<String> keepTypes = Sets.newHashSet("normal");
    private Set<String> keepFields = Sets.newHashSet("ssn", "state");
    private boolean verbose = true;

    /** True once the current {@link #keepTypes} is known to match at least one loaded row. */
    private boolean typesChecked = false;

    /**
     * Loads the area code table from the {@code ssn-seeds} classpath resource.
     * <p>
     * Lines starting with {@code #} are comments; the last one seen before the data supplies the
     * comma-separated column names.  Every other line is a comma-separated data row.
     * NOTE(review): the first two columns of a data row are treated as the inclusive low and high
     * area numbers of a range - the seed file is not visible from here, confirm against it.
     *
     * @throws RuntimeException      if the resource cannot be read
     * @throws IllegalStateException if a data row precedes the header, a row has fewer than two
     *                               columns, or the resource has no usable header
     */
    public SsnSampler() {
        List<String> lines;
        try {
            lines = Resources.readLines(Resources.getResource("ssn-seeds"), Charsets.UTF_8);
        } catch (IOException e) {
            throw new RuntimeException("Couldn't read built-in resource", e);
        }

        Splitter onComma = Splitter.on(",").trimResults();
        List<String> header = null;
        for (String line : lines) {
            if (line.startsWith("#")) {
                // last comment line contains actual field names
                header = Lists.newArrayList(onComma.split(line.substring(1)));
            } else {
                Preconditions.checkState(header != null, "Data line precedes the header comment in ssn-seeds: %s", line);

                List<String> fields = Lists.newArrayList(onComma.split(line));
                Preconditions.checkState(fields.size() >= 2, "Malformed line in ssn-seeds: %s", line);

                // Expand the whole inclusive range.  The original loop started at the upper bound,
                // so only the last area number of each range was ever registered.
                int low = Integer.parseInt(fields.get(0));
                int high = Integer.parseInt(fields.get(1));
                List<String> description = fields.subList(2, fields.size());
                for (int i = low; i <= high; i++) {
                    String key = String.format("%03d", i);
                    values.put(key, description);
                    codes.add(key);
                }
            }
        }

        // An explicit check instead of an assert so the failure is the same with assertions disabled.
        Preconditions.checkState(header != null && header.size() >= 2, "No usable header line found in ssn-seeds");
        // The first two columns describe the area number range and are not exposed as fields.
        names = header.subList(2, header.size());
        typeIndex = names.indexOf("type");
    }

    /**
     * Replaces the random number source with a {@link Random} seeded with the given value.
     *
     * @param seed the seed for the new generator
     */
    public void setSeed(long seed) {
        rand = new Random(seed);
    }

    /**
     * Limits the fields that are returned to only those that are specified.
     * <p>
     * The current selection is left untouched if the argument is rejected.
     *
     * @param fields field names separated by whitespace, commas or semicolons; each must be
     *               {@code "ssn"} or one of the column names from the seed data
     * @throws IllegalArgumentException if no field is named or a name is unknown
     */
    @SuppressWarnings("UnusedDeclaration")
    public void setFields(String fields) {
        Set<String> requested = Sets.newHashSet(LIST_SPLITTER.split(fields));
        Preconditions.checkArgument(!requested.isEmpty(), "Illegal field: %s", fields);
        for (String field : requested) {
            Preconditions.checkArgument(names.contains(field) || "ssn".equals(field), "Illegal field: %s", field);
        }
        // Only commit once everything has been validated.
        keepFields = requested;
    }

    /**
     * Limits sampling to area codes whose type is one of those specified.
     * <p>
     * The current selection is left untouched if the argument is rejected.
     *
     * @param types type names separated by whitespace, commas or semicolons; each must be
     *              {@code "normal"} or {@code "extra"}
     * @throws IllegalArgumentException if no type is named or a name is not legal
     */
    @SuppressWarnings("UnusedDeclaration")
    public void setTypes(String types) {
        Set<String> requested = Sets.newHashSet(LIST_SPLITTER.split(types));
        Preconditions.checkArgument(!requested.isEmpty(), "Illegal type requested: %s, needed one of %s",
                types, LEGAL_TYPES);
        for (String type : requested) {
            Preconditions.checkArgument(LEGAL_TYPES.contains(type), "Illegal type requested: %s, needed one of %s",
                    type, LEGAL_TYPES);
        }
        // Only commit once everything has been validated.
        keepTypes = requested;
        // The new selection has to be re-checked against the loaded rows before sampling.
        typesChecked = false;
    }

    /**
     * Chooses between object output and bare text output.
     *
     * @param verbose {@code true} to return a JSON object with the selected fields, {@code false}
     *                to return only the number as a text node
     */
    @SuppressWarnings("UnusedDeclaration")
    public void setVerbose(boolean verbose) {
        this.verbose = verbose;
    }

    /**
     * Draws one sample.
     * <p>
     * Area codes are drawn by rejection: a code is picked at random from all loaded codes and
     * discarded if its type is not currently permitted.
     *
     * @return in verbose mode an object node with the selected fields (including {@code "ssn"} if
     *         selected), otherwise a text node holding the number
     * @throws IllegalStateException if the seed data has no "type" column or no loaded row has a
     *                               permitted type, in which case rejection sampling could never finish
     */
    @Override
    public JsonNode sample() {
        checkTypesAvailable();
        while (true) {
            String code = codes.get(rand.nextInt(codes.size()));
            List<String> fields = values.get(code);
            if (!keepTypes.contains(fields.get(typeIndex))) {
                continue;
            }

            if (!verbose) {
                return new TextNode(randomSsn(code));
            }

            ObjectNode rx = new ObjectNode(nodeFactory);
            // Walk column names and column values in lock step; they must have the same length.
            Iterator<String> nx = names.iterator();
            for (String field : fields) {
                Preconditions.checkState(nx.hasNext());
                String fieldName = nx.next();
                if (keepFields.contains(fieldName)) {
                    rx.set(fieldName, new TextNode(field));
                }
            }
            Preconditions.checkState(!nx.hasNext());
            if (keepFields.contains("ssn")) {
                rx.set("ssn", new TextNode(randomSsn(code)));
            }
            return rx;
        }
    }

    /** Formats a number from the given area code plus a random group (01..99) and serial (0001..9999). */
    private String randomSsn(String areaCode) {
        return String.format("%s-%02d-%04d", areaCode, rand.nextInt(99) + 1, rand.nextInt(9999) + 1);
    }

    /** Fails fast if rejection sampling in {@link #sample()} could not terminate; the result is cached. */
    private void checkTypesAvailable() {
        if (typesChecked) {
            return;
        }
        Preconditions.checkState(typeIndex >= 0, "Seed data has no \"type\" column");
        boolean found = false;
        for (List<String> fields : values.values()) {
            if (keepTypes.contains(fields.get(typeIndex))) {
                found = true;
                break;
            }
        }
        Preconditions.checkState(found, "No seed entries have any of the requested types: %s", keepTypes);
        typesChecked = true;
    }
}