Source code

Java tutorial


Here is the source code for org.mitre.provenance.capture.linux.PROCtor.java


/* Copyright 2014 MITRE Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.mitre.provenance.capture.linux;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.FilenameFilter;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.UserPrincipal;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.logging.Logger;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.mitre.provenance.Metadata;
import org.mitre.provenance.PLUSException;
import org.mitre.provenance.client.AbstractProvenanceClient;
import org.mitre.provenance.client.LocalProvenanceClient;
import org.mitre.provenance.client.ProvenanceClient;
import org.mitre.provenance.contenthash.ContentHasher;
import org.mitre.provenance.contenthash.SHA256ContentHasher;
import org.mitre.provenance.db.neo4j.Neo4JPLUSObjectFactory;
import org.mitre.provenance.npe.NonProvenanceEdge;
import org.mitre.provenance.plusobject.PLUSEdge;
import org.mitre.provenance.plusobject.PLUSFile;
import org.mitre.provenance.plusobject.PLUSInvocation;
import org.mitre.provenance.plusobject.PLUSObject;
import org.mitre.provenance.plusobject.PLUSWorkflow;
import org.mitre.provenance.plusobject.ProvenanceCollection;
import org.mitre.provenance.user.User;

/**
 * This class is an operating system monitoring class for UNIX-based operating systems which support the proc filesystem.
 * For more information about procfs, see the proc(5) manual page.
 * <p>Basically, this polls available OS information about processes that are running, and then saves that information as provenance.
 * The OS will tell us for example which process IDs (PIDs) have which files open for read and write, and what the command line is
 * of the application that executed.
 * <p>We have to apply a few basic fingerprinting techniques to avoid logging duplicates.
 * <p>This code could doubtless see many improvements, but it's a basic proof of concept for how to collect provenance in real systems.
 * For many users, this kind of provenance would be seen as too granular, but it can produce some very interesting findings; in
 * particular, because we use content-bound identifiers on everything that we encounter, this can establish linkages between
 * different processes that read and use the same files.
 * <p>A major weakness of this capture approach is that you can never know when in the process lifecycle to scan a particular PID.
 * Which assets the process is using varies dramatically (particularly for long-lived processes) depending on when you hit it in
 * the lifecycle.  Improvements should focus on appending in subsequent polls.
 * @author moxious
 */
public class PROCtor {
    // Logger named after this class.
    protected static final Logger log = Logger.getLogger(PROCtor.class.getName());
    // PID of the running JVM, assigned in the constructor; poll() and createOrRetrieveInvocation() skip it.
    protected String myPID = null;
    // Objects already seen, keyed by the identifier computed by getIDForFile().
    // NOTE(review): 1000 is presumably the maximum number of cached entries -- confirm against LRUCache.
    public static final LRUCache<String, PLUSObject> cache = new LRUCache<String, PLUSObject>(1000);

    // PIDs that polling is restricted to; while this set is empty, poll() processes every PID it finds.
    protected HashSet<String> pollPIDs = new HashSet<String>();
    // Client used by processPID() to ask whether an invocation already exists; main() also publishes it
    // through ProvenanceClient.instance.
    protected static AbstractProvenanceClient client = new LocalProvenanceClient();
    // Hasher used for both the path/timestamp identifiers and the file content hashes.
    protected SHA256ContentHasher hasher = new SHA256ContentHasher();

    // Metadata key under which the identifier from getIDForFile() is stored on files and invocations.
    public static final String UUID_KEY = "file_uuid";

     * Signals that an object already exists.
     * @author david
    public static class ExistsException extends PLUSException {
        private static final long serialVersionUID = 11233123L;
        protected PLUSObject o;

        public ExistsException(PLUSObject obj) {
            this.o = obj;

        public PLUSObject getObject() {
            return o;

    public void addPID(String pid) {

    //HashMap<String,PLUSObject> cache = new HashMap<String,PLUSObject>();
    // Root of the proc filesystem; poll() lists its entries and main() refuses to run when it does not exist.
    protected static File PROC = new File("/proc");

    /**
     * Creates a monitor and remembers the PID of the running JVM so that it can be skipped during capture.
     * @throws Exception declared for callers; the visible body only calls {@link #getMyPID()}.
     */
    public PROCtor() throws Exception {
        myPID = PROCtor.getMyPID();
    }

    public void run(long pollTimeoutMs, int times) throws Exception {
        int x = 0;

        while (true) {
            if (times > 0 && x >= times)


    protected List<String> slurpLines(File f) {
        BufferedReader br = null;
        ArrayList<String> lines = new ArrayList<String>();

        try {
            br = new BufferedReader(new FileReader(f));
            String line = null;
            while ((line = br.readLine()) != null)
            return lines;
        } catch (IOException exc) {
            return null;
        } finally {
            try {
            } catch (IOException e) {
    } // End slurpLines

     * Read the complete contents of a file and return them as a string.   Simple utility for tiny files.
     * @param f file to read.
     * @return the complete text contents
    protected String slurp(File f) {
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(f));
            StringBuffer b = new StringBuffer("");
            String line = null;
            while ((line = br.readLine()) != null)
            return b.toString();
        } catch (IOException ioe) {
            return null;
        } finally {
            try {
            } catch (IOException e) {

     * Computes a special identifier for files based on their path and when they were last modified.  This is not a content-bound identifier,
     * but can be used in case a duplicate file has been seen on the same system.
     * @param f the file to use
     * @return a string identifier
     * @throws NoSuchAlgorithmException
     * @throws IOException
    protected String getIDForFile(File f) throws NoSuchAlgorithmException, IOException {
        // Unique ID for a file based on its absolute pathname, and last modified date.
        // When this hash value changes, you know it's a different file.
        String stamp = f.getCanonicalPath() + "-" + f.lastModified();
        return ContentHasher.formatAsHexString(hasher.hash(new ByteArrayInputStream(stamp.getBytes())));

     * Polls through all available items in the proc fs, and processes them individually.
     * @throws IOException
     * @throws NoSuchAlgorithmException
     * @throws PLUSException
    protected void poll() throws IOException, NoSuchAlgorithmException, PLUSException {
        String[] PIDs = PROC.list(new FilenameFilter() {
            public boolean accept(File dir, String name) {
                // Match only filenames that are entirely numeric.
                // These filenames correspond to system PIDs (process IDs)
                return name.matches("^[0-9]+$");

        for (String pid : PIDs) {
            if (pid.equals(myPID))
                continue; // Don't process myself.

            if (pollPIDs.isEmpty() || pollPIDs.contains(pid))
                processPID(new File(PROC, pid));

    protected ProcFDInfo getFDInfo(File procPID, String fd) {
        File fdInfoFile = new File(new File(procPID, "fdinfo"), fd);

        if (!fdInfoFile.exists())
            return null;

        List<String> lines = slurpLines(fdInfoFile);

        String flags = null;
        String pos = null;

        for (String line : lines) {
            if (line.indexOf(':') != -1) {
                String[] toks = line.split("[ \\t]+");
                if (toks[0].contains("pos"))
                    pos = toks[1];
                else if (toks[0].contains("flags"))
                    flags = toks[1];
                    log.warning("Unexpected line '" + line + "' in " + fdInfoFile.getAbsolutePath());
            } else
                // Ignore other lines, (inotify, tfd, eventfd-count, others)

            if (flags != null && pos != null)

        // Shouldn't happen...
        if (pos == null || flags == null)
            return null;

        return new ProcFDInfo(pos, flags);

    /**
     * Processes a PID identified by a particular /proc filesystem path, and creates the necessary provenance objects.
     * @param procPID the /proc directory of the process to inspect, e.g. /proc/56.
     * @throws IOException
     * @throws NoSuchAlgorithmException
     * @throws PLUSException
     */
    protected void processPID(File procPID) throws IOException, NoSuchAlgorithmException, PLUSException {
        // Guard against a /proc entry that is no longer there.
        if (!procPID.exists()) {
            log.warning("PID " + procPID + " doesn't exist.");
            // NOTE(review): no early return or closing brace is visible after this warning -- this listing looks truncated; confirm.

        // Cached, previously stored, or newly built invocation for this process.  createOrRetrieveInvocation
        // returns null for this program's own PID and for /proc entries it cannot list.
        PLUSInvocation inv = createOrRetrieveInvocation(procPID);
        if (inv == null)
            // NOTE(review): the statement guarded by this check is not visible -- presumably an early return; confirm.

        String[] fileDescriptors = null;
        // The "fd" directory of the process; each entry name is used as a file descriptor name below.
        File fds = new File(procPID, "fd");
        fileDescriptors = fds.list();

        if (fileDescriptors == null) {
            // NOTE(review): the body of this guard is not visible; the loop below cannot iterate a null array,
            // so presumably an early return belongs here -- confirm.
        } // No permissions here.

        // Collects the nodes and edges built below.
        ProvenanceCollection pcol = new ProvenanceCollection();

        // Whether the client already knows this invocation; in the visible code it only affects the log line at the end.
        boolean revisiting = false;

        if (client.exists(inv) != null)
            revisiting = true;

        // UUID_KEY metadata values of the files seen, grouped by how the process has them open.
        List<String> inputs = new ArrayList<String>();
        List<String> outputs = new ArrayList<String>();
        List<String> related = new ArrayList<String>();

        for (String fdName : fileDescriptors) {
            File fdFile = new File(fds, fdName);

            // We get the canonical file to resolve the procfs symlink, so that 
            // we're gathering metadata about the file, and not a symlink to the file.
            File canonical = fdFile.getCanonicalFile();

            boolean previouslyWritten = false;
            PLUSObject fdObj = null;

            // This is what will let us know whether the file was open for input/output, or whatever.
            ProcFDInfo fdInfo = getFDInfo(procPID, fdName);
            if (fdInfo == null) {
                log.warning("Couldn't get fdInfo for " + procPID + "/fdinfo/" + fdName);
                // NOTE(review): fdInfo is dereferenced further down, so a skip of this descriptor and a closing
                // brace appear to be missing here -- confirm.

            try {
                fdObj = createOnlyIfNew(canonical);
            } catch (ExistsException e) {
                // There is a valid file here, but we've already seen it.  That means don't add it
                // to the collection or try to re-write it.
                previouslyWritten = true;
                fdObj = e.getObject();
                // NOTE(review): the closing brace of this catch block is not visible.

            // createOnlyIfNew returns null for paths that do not exist or are not regular files.
            if (fdObj == null)
                // NOTE(review): the guarded statement is not visible; fdObj is dereferenced below, so presumably
                // the descriptor is skipped -- confirm.

            if (!previouslyWritten) {
                // Record the descriptor name the file was seen under.
                fdObj.getMetadata().put("unix:fd", fdName);
                // NOTE(review): the rest of this block (including its closing brace) is not visible.

            // It's an output if we're appending to it, creating it, writing only to it, or truncating it.
            if (fdInfo.O_APPEND() || fdInfo.O_CREAT() || fdInfo.O_WRONLY() || fdInfo.O_TRUNC())
                outputs.add("" + fdObj.getMetadata().get(UUID_KEY));
            // It's an input if we're read only.
            else if (fdInfo.O_RDONLY())
                inputs.add("" + fdObj.getMetadata().get(UUID_KEY));
            // Read/write descriptors are neither clearly input nor output; they are tracked separately.
            else if (fdInfo.O_RDWR())
                related.add("" + fdObj.getMetadata().get(UUID_KEY));
            else {
                log.warning("Ambiguous mode for " + procPID + "/fdinfo/" + fdName + ": " + fdInfo.getFlags());

            // NOTE(review): the two adds below look like the branches of an if/else on canWrite(), but the
            // "else" and the closing brace of the block above are not visible -- confirm.
            if (fdFile.canWrite())
                outputs.add("" + fdObj.getMetadata().get(UUID_KEY));
                inputs.add("" + fdObj.getMetadata().get(UUID_KEY));

            String file_uuid = "" + fdObj.getMetadata().get(UUID_KEY);

            // NOTE(review): lines may be missing around this check; confirm that the condition and the guarded
            // statement belong together as shown.
            if (previouslyWritten)
                pcol.addNonProvenanceEdge(new NonProvenanceEdge(fdObj, file_uuid, UUID_KEY));

        // Inputs point from the file to the invocation.  Only objects still present in the cache get an edge.
        for (String id : inputs) {
            PLUSObject o = (PLUSObject) cache.get(id);
            if (o != null)
                pcol.addEdge(new PLUSEdge(o, inv));

        // Outputs point from the invocation to the file.
        for (String id : outputs) {
            PLUSObject o = (PLUSObject) cache.get(id);
            if (o != null)
                pcol.addEdge(new PLUSEdge(inv, o));

        for (String id : related) {
            // Just mark these as "contributing".
            PLUSObject o = (PLUSObject) cache.get(id);
            if (o != null)
                pcol.addEdge(new PLUSEdge(o, inv, PLUSWorkflow.DEFAULT_WORKFLOW, PLUSEdge.EDGE_TYPE_CONTRIBUTED));

        boolean written = false;

        // NOTE(review): the right-hand side of the assignment below is missing in this listing -- presumably a
        // call that stores pcol through the client; confirm.
        if (pcol.countNodes() > 0)
            written =;

        // NOTE(review): the start of the statement below is missing -- presumably a log call whose message
        // begins with a conditional on "revisiting"; confirm.
        if (written)
   ? "REVISITED" : "NEW") + ": " + inv.getMetadata().get("cmdline") + " PID "
                    + inv.getMetadata().get("pid") + " => " + inputs.size() + " inputs, " + outputs.size()
                    + " outputs.  Total written=" + written);

    public boolean isSymlink(File file) throws IOException {
        if (file == null)
            return false;

        File canon;
        if (file.getParent() == null)
            canon = file;
        else {
            File canonDir = file.getParentFile().getCanonicalFile();
            canon = new File(canonDir, file.getName());

        return !canon.getCanonicalFile().equals(canon.getAbsoluteFile());

     * Return the PID of the process that PROCtor is running underneath.
     * @return
    public static String getMyPID() {
        String pidStr = ManagementFactory.getRuntimeMXBean().getName();

        int idx = pidStr.indexOf("@");

        if (idx == -1)
            return pidStr;
            return pidStr.substring(0, idx);

     * Get or create a new PLUSInvocation on the basis of a proc PID file, e.g. /proc/56 (pid 56)
     * Returns null for insufficient permissions, or when  you shouldn't log a particular pid.  (For 
     * example, this program will not log its own run)
    public PLUSInvocation createOrRetrieveInvocation(File procPID) throws NoSuchAlgorithmException, IOException {
        String procFileID = getIDForFile(procPID);
        if (procFileID == null)
            return null;

        String pid = procPID.getName();
        if (pid.equals(myPID))
            return null; // Don't log myself.

        String[] children = procPID.list();
        if (children == null)
            return null; // No permissions.

        if (cache.containsKey(procFileID))
            return (PLUSInvocation) cache.get(procFileID);

        try {
            ProvenanceCollection results = Neo4JPLUSObjectFactory.loadBySingleMetadataField(User.DEFAULT_USER_GOD,
                    UUID_KEY, procFileID);
            if (results != null && results.countNodes() > 0) {
                PLUSInvocation i = (PLUSInvocation) results.getNodes().toArray()[0];
                cache.put(procFileID, i);
                return i;
        } catch (PLUSException exc) {

        long lmod = procPID.lastModified();

        String cmdline = slurp(new File(procPID, "cmdline"));
        File exe = new File(procPID, "exe").getCanonicalFile();
        File cwd = new File(procPID, "cwd").getCanonicalFile();

        PLUSInvocation inv = new PLUSInvocation(exe.getCanonicalPath());
        inv.getMetadata().put("pid", pid);
        inv.getMetadata().put("cwd", cwd.getCanonicalPath());
        inv.getMetadata().put("cmdline", cmdline);
        inv.getMetadata().put("started", "" + lmod);
        inv.getMetadata().put(UUID_KEY, procFileID);
        inv.getMetadata().put(Metadata.CONTENT_HASH_SHA_256, procFileID);

        Path path = Paths.get(procPID.getAbsolutePath());
        UserPrincipal owner = Files.getOwner(path);
        String username = owner.getName();
        try {
            inv.setOwner(Neo4JPLUSObjectFactory.getActor(username, true));
        } catch (PLUSException exc) {
            log.warning("Failed to set owner for " + inv + ": " + exc.getMessage());

        cache.put(procFileID, inv); // Cache this so we don't go back over it.

        return inv;

     * Create a PLUSObject corresponding to a given file, only if that file is new.  Note that throwing an
     * ExistsException is not an error condition, to signal to the caller that provenance already exists.
     * @param f the file to inspect.
     * @return a PLUSObject if it is new.
     * @throws ExistsException if provenance already exists for that object, this will be thrown.
     * @throws NoSuchAlgorithmException on error
     * @throws IOException on error.
    public PLUSObject createOnlyIfNew(File f) throws ExistsException, NoSuchAlgorithmException, IOException {
        if (f == null || !f.exists())
            return null;

        if (!f.isFile())
            return null; // Don't log things like sockets right now.
        String id = getIDForFile(f);

        if (id == null) {
            log.warning("Couldn't compute file id for " + f);
            return null;

        if (cache.containsKey(id))
            throw new ExistsException(cache.get(id));

        ProvenanceCollection results = null;

        try {
            results = Neo4JPLUSObjectFactory.loadBySingleMetadataField(User.DEFAULT_USER_GOD, UUID_KEY, id, 1);
        } catch (PLUSException exc) {
            throw new RuntimeException(exc);

        if (results != null && results.countNodes() > 0) {
            PLUSObject o = (PLUSObject) results.getNodes().toArray()[0];
            cache.put(id, o);
            throw new ExistsException(o);

        PLUSFile pf = new PLUSFile(f);
        pf.getMetadata().put(UUID_KEY, id);

        if (id != null)
            cache.put(id, pf);

        if (f.isFile()) {
            long fileSize = 0;
            try {
                fileSize = f.length();
            } catch (Exception exc) {
                return pf;

            // Best effort to hash the content.
            if (fileSize < 1024 * 1024) {
                FileInputStream fis = null;
                try {
                    fis = new FileInputStream(f);
                    String sha256hash = ContentHasher.formatAsHexString(hasher.hash(fis));
                    pf.getMetadata().put(Metadata.CONTENT_HASH_SHA_256, sha256hash);
                } catch (IOException exc) {
                } finally {
                    if (fis != null)
                        try {
                        } catch (Exception e) {

        return pf;

    public static Options makeCLIOptions() {
        Options options = new Options();

                .withDescription("If specified, capture only provenance for this single PID and its children.")

                .withDescription("Poll the PID fs once, and then quit").create("once"));

                .withDescription("Poll continuously until user interrupts.").create("poll"));

        return options;

    public static void usage() {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp("PROCtor", makeCLIOptions());

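    /**
     * If provided with arguments, the program processes only those PIDs. If given no arguments, it starts in polling mode.
     * @param args command line arguments; see makeCLIOptions() for the supported options.
     * @throws Exception if capture fails.
     */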
    public static void main(String[] args) throws Exception {
        // Publish this class's client through ProvenanceClient.instance.
        ProvenanceClient.instance = client;

        CommandLineParser parser = new GnuParser();

        // Without a proc filesystem there is nothing to poll.
        if (!PROC.exists()) {
            // NOTE(review): the start of this statement is missing in this listing -- presumably the message is
            // printed and the program stops; confirm.
                    "This utility is intended to run on Linux systems with a PROC filesystem. You do not appear to have one (or it is not readable)");

        try {
            CommandLine line = parser.parse(makeCLIOptions(), args);
            String pidArg = line.getOptionValue("pid");

            boolean once = line.hasOption("once");
            boolean poll = line.hasOption("poll");

            System.out.println("Once " + once + " poll " + poll);

            PROCtor p = new PROCtor();

            // The two modes are mutually exclusive.
            if (once && poll) {
                System.err.println("You can't specify both to run once and to poll.");
                // NOTE(review): what happens after this message (and the closing brace) is not visible; confirm.

            // Default is to poll if user hasn't otherwise specified.
            if (!poll && !once)
                poll = true;

            if (pidArg != null) {
                System.out.println("PID=" + pidArg);
                // Several PIDs may be given in one value, separated by spaces.
                String[] pids = pidArg.split(" +");

                for (String pid : pids) {
                    // NOTE(review): the loop body is not visible -- presumably each pid is registered with addPID; confirm.

            // NOTE(review): the calls below are truncated; only their trailing arguments (-1 and 1) survive.
            // They presumably invoke run(...) with "times" of -1 when polling and 1 when running once; the
            // first argument is unknown -- confirm.
            if (poll)
      , -1);
      , 1);
        } catch (ParseException exc) {
            // NOTE(review): the handler body and the method's closing braces are not visible.
} // End PROCtor