/**
 * Copyright 2011 Twitter, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.twitter.pycascading;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.util.Iterator;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.python.core.Py;
import org.python.core.PyDictionary;
import org.python.core.PyFunction;
import org.python.core.PyIterator;
import org.python.core.PyList;
import org.python.core.PyObject;
import org.python.core.PyString;
import org.python.core.PyTuple;
import org.python.util.PythonInterpreter;

import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.OperationCall;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntry;

/**
 * Wrapper for a Cascading BaseOperation that prepares the input tuples for a
 * Python function. It can convert between tuples and Python lists and dicts.
 *
 * @author Gabor Szabo
 */
@SuppressWarnings({ "rawtypes", "deprecation" })
public class CascadingBaseOperationWrapper extends BaseOperation implements Serializable {
  private static final long serialVersionUID = -535185466322890691L;

  // This defines whether the input tuples should be converted to Python lists
  // or dicts before being passed to the Python function
  public enum ConvertInputTuples {
    NONE, PYTHON_LIST, PYTHON_DICT
  }

  private PyObject function;
  private ConvertInputTuples convertInputTuples;
  private PyTuple contextArgs = null;
  protected PyDictionary contextKwArgs = null;
  private PyFunction writeObjectCallBack;
  private byte[] serializedFunction;

  // These variables cache values to speed up the frequent UDF calls
  protected PyObject[] callArgs = null;
  private String[] contextKwArgsNames = null;

  /**
   * Class to convert elements in an iterator to the corresponding Jython
   * objects.
   *
   * @author Gabor Szabo
   *
   * @param <I>
   *          the type of the items
   */
  class ConvertIterable<I> implements Iterator<PyObject> {
    private Iterator<I> iterator;

    public ConvertIterable(Iterator<I> iterator) {
      this.iterator = iterator;
    }

    @Override
    public boolean hasNext() {
      return iterator.hasNext();
    }

    @Override
    public PyObject next() {
      return Py.java2py(iterator.next());
    }

    @Override
    public void remove() {
      iterator.remove();
    }
  }
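  // A minimal usage sketch (illustrative only): this is how convertInput()
  // below builds a Python list from a Cascading tuple, converting each field
  // with Py.java2py() as the iterator is consumed:
  //
  //   PyList asList = new PyList(
  //       new ConvertIterable<Object>(tupleEntry.getTuple().iterator()));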
  /**
   * The default constructor is necessary for deserialization.
   */
  public CascadingBaseOperationWrapper() {
    super();
  }

  public CascadingBaseOperationWrapper(Fields fieldDeclaration) {
    super(fieldDeclaration);
  }

  public CascadingBaseOperationWrapper(int numArgs) {
    super(numArgs);
  }

  public CascadingBaseOperationWrapper(int numArgs, Fields fieldDeclaration) {
    super(numArgs, fieldDeclaration);
  }

  private PythonInterpreter setupInterpreter(JobConf jobConf, FlowProcess flowProcess) {
    String pycascadingDir = null;
    String sourceDir = null;
    String[] modulePaths = null;
    if ("hadoop".equals(jobConf.get("pycascading.running_mode"))) {
      try {
        Path[] archives = DistributedCache.getLocalCacheArchives(jobConf);
        pycascadingDir = archives[0].toString() + "/";
        sourceDir = archives[1].toString() + "/";
        modulePaths = new String[archives.length];
        int i = 0;
        for (Path archive : archives) {
          modulePaths[i++] = archive.toString();
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    } else {
      pycascadingDir = System.getProperty("pycascading.root") + "/";
      sourceDir = "";
      modulePaths = new String[] { pycascadingDir, sourceDir };
    }

    PythonInterpreter interpreter = Main.getInterpreter();
    interpreter.execfile(pycascadingDir + "python/pycascading/init_module.py");
    interpreter.set("module_paths", modulePaths);
    interpreter.eval("setup_paths(module_paths)");

    // We set the Python variable "map_input_file" to the path of the mapper's
    // input file. Unfortunately this is null with the old Hadoop API, see
    // https://groups.google.com/group/cascading-user/browse_thread/thread/d65960ad738bebd4/f343e91625cf3c07
    // http://lucene.472066.n3.nabble.com/map-input-file-in-20-1-td961619.html
    // https://issues.apache.org/jira/browse/MAPREDUCE-2166
    interpreter.set("map_input_file", jobConf.get("map.input.file"));

    // We set the Python variable "jobconf" to the MapReduce jobconf
    interpreter.set("jobconf", jobConf);

    // The flowProcess passed to the Operation is passed on to the Python
    // function in the variable flow_process
    interpreter.set("flow_process", flowProcess);

    // We need to run the main file first so that imports etc. are defined,
    // and nested functions can also be used
    interpreter.execfile(sourceDir + (String) jobConf.get("pycascading.main_file"));
    return interpreter;
  }
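  // Serialization layout: writeObject() below writes the Python state in a
  // fixed order that prepare() reads back symmetrically: the function, the
  // input conversion type, a Boolean flag followed by the positional context
  // arguments if present, and a Boolean flag followed by the keyword context
  // arguments if present.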
  // We need to delay the deserialization of the Python functions until this
  // point, since the sources are in the distributed cache, whose location is
  // in the jobconf, and this is the first time we have access to the jobconf.
  @Override
  public void prepare(FlowProcess flowProcess, OperationCall operationCall) {
    JobConf jobConf = ((HadoopFlowProcess) flowProcess).getJobConf();
    PythonInterpreter interpreter = setupInterpreter(jobConf, flowProcess);
    ByteArrayInputStream bais = new ByteArrayInputStream(serializedFunction);
    try {
      PythonObjectInputStream pythonStream = new PythonObjectInputStream(bais, interpreter);
      function = (PyObject) pythonStream.readObject();
      convertInputTuples = (ConvertInputTuples) pythonStream.readObject();
      if ((Boolean) pythonStream.readObject())
        contextArgs = (PyTuple) pythonStream.readObject();
      if ((Boolean) pythonStream.readObject())
        contextKwArgs = (PyDictionary) pythonStream.readObject();
      bais.close();
    } catch (Exception e) {
      // If there is any kind of exception (ClassNotFoundException or
      // IOException), we don't want to continue.
      throw new RuntimeException(e);
    }
    serializedFunction = null;
    if (!PyFunction.class.isInstance(function)) {
      // function is assumed to be decorated, resulting in a DecoratedFunction,
      // so we can get the original function back. This is done only for
      // performance reasons; it's just as good to comment this block out, as a
      // DecoratedFunction is callable anyway. If we were to decorate the
      // functions with other decorators as well, we certainly could not use
      // this.
      try {
        function = (PyFunction) ((PyDictionary) (function.__getattr__(new PyString("decorators"))))
                .get(new PyString("function"));
      } catch (Exception e) {
        throw new RuntimeException(
                "Expected a Python function or a decorated function. This shouldn't happen.");
      }
    }
    setupArgs();
  }

  private void writeObject(ObjectOutputStream stream) throws IOException {
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    PythonObjectOutputStream pythonStream = new PythonObjectOutputStream(baos, writeObjectCallBack);
    pythonStream.writeObject(function);
    pythonStream.writeObject(convertInputTuples);
    pythonStream.writeObject(Boolean.valueOf(contextArgs != null));
    if (contextArgs != null) {
      pythonStream.writeObject(contextArgs);
    }
    pythonStream.writeObject(Boolean.valueOf(contextKwArgs != null));
    if (contextKwArgs != null) {
      pythonStream.writeObject(contextKwArgs);
    }
    pythonStream.close();
    stream.writeObject(baos.toByteArray());
  }

  private void readObject(ObjectInputStream stream) throws IOException, ClassNotFoundException,
          URISyntaxException {
    // TODO: we need to start up the interpreter and run all the imports, as
    // the parameters may use other imports, like datetime. Or how else could
    // we do this better?
    serializedFunction = (byte[]) stream.readObject();
  }

  /**
   * We assume that the Python functions (map and reduce) are always called
   * with the same number of arguments. Override this to return the number of
   * arguments we will be passing in every time.
   *
   * @return the number of arguments the wrapper is passing in
   */
  public int getNumParameters() {
    return 0;
  }

  /**
   * Sets up the local variables that were not serialized, for performance.
   */
  protected void setupArgs() {
    int numArgs = getNumParameters();
    callArgs = new PyObject[numArgs + (contextArgs == null ? 0 : contextArgs.size())
            + (contextKwArgs == null ? 0 : contextKwArgs.size())];
    int i = numArgs;
    if (contextArgs != null) {
      PyObject[] args = contextArgs.getArray();
      for (PyObject arg : args) {
        callArgs[i] = arg;
        i++;
      }
    }
    if (contextKwArgs != null) {
      PyIterator values = (PyIterator) contextKwArgs.itervalues();
      PyObject value = values.__iternext__();
      while (value != null) {
        callArgs[i] = value;
        value = values.__iternext__();
        i++;
      }
      contextKwArgsNames = new String[contextKwArgs.size()];
      PyIterator keys = (PyIterator) contextKwArgs.iterkeys();
      PyObject key = keys.__iternext__();
      int j = 0;
      while (key != null) {
        contextKwArgsNames[j] = ((PyString) key).asString();
        key = keys.__iternext__();
        j++;
      }
    }
  }
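  // Layout of callArgs after setupArgs(): the first getNumParameters() slots
  // are left empty for the per-tuple arguments that derived classes fill in,
  // followed by the positional context arguments, then the values of the
  // keyword context arguments. contextKwArgsNames lists the keyword names in
  // the same order, following Jython's __call__(PyObject[] args, String[]
  // keywords) convention, in which the trailing arguments are the keyword
  // values.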
  @SuppressWarnings("unchecked")
  public Object convertInput(TupleEntry tupleEntry) {
    Object result = null;
    if (convertInputTuples == ConvertInputTuples.NONE) {
      // We don't need to convert the tuples
      result = tupleEntry;
    } else if (convertInputTuples == ConvertInputTuples.PYTHON_LIST) {
      // The user wants a Python list
      result = new PyList(new ConvertIterable<Object>(tupleEntry.getTuple().iterator()));
    } else if (convertInputTuples == ConvertInputTuples.PYTHON_DICT) {
      // The user wants a Python dict
      PyObject[] dictElements = new PyObject[2 * tupleEntry.size()];
      // Here we convert Java objects to Jython objects
      // http://osdir.com/ml/lang.jython.devel/2006-05/msg00022.html
      // If the fields are not named in the tuple, generate keys from their
      // integer index.
      int i = 0;
      Iterator<Object> iter = tupleEntry.getFields().iterator();
      while (i < dictElements.length) {
        dictElements[i] = Py.java2py(iter.hasNext() ? iter.next() : i / 2);
        i += 2;
      }
      i = 1;
      for (Object value : tupleEntry.getTuple()) {
        dictElements[i] = Py.java2py(value);
        i += 2;
      }
      result = new PyDictionary(dictElements);
    }
    return result;
  }

  /**
   * This calls the Python function on behalf of the BaseOperation. The
   * callArgs field is protected so that derived classes can put the function
   * parameters into it.
   *
   * @return the return value of the Python function
   */
  public PyObject callFunction() {
    if (contextKwArgsNames == null)
      return function.__call__(callArgs);
    else
      return function.__call__(callArgs, contextKwArgsNames);
  }

  /**
   * Setter for the Python function object.
   *
   * @param function
   *          the Python function
   */
  public void setFunction(PyFunction function) {
    this.function = function;
  }

  /**
   * Setter for the input tuple conversion type.
   *
   * @param convertInputTuples
   *          whether to do any conversion on the input tuples, and the type
   *          they are converted to (none/list/dict)
   */
  public void setConvertInputTuples(ConvertInputTuples convertInputTuples) {
    this.convertInputTuples = convertInputTuples;
  }

  /**
   * Setter for the constant unnamed arguments that are passed to the UDF in
   * addition to the tuples.
   *
   * @param args
   *          the additional unnamed arguments
   */
  public void setContextArgs(PyTuple args) {
    contextArgs = args;
    setupArgs();
  }

  /**
   * Setter for the constant named arguments that are passed to the UDF in
   * addition to the tuples.
   *
   * @param kwargs
   *          the additional named arguments
   */
  public void setContextKwArgs(PyDictionary kwargs) {
    contextKwArgs = kwargs;
    setupArgs();
  }

  /**
   * Sets the Python callback function used to get the source of a PyFunction.
   * It is better to do this in Python with the inspect module than to hack it
   * in Java.
   *
   * @param callBack
   *          the PyFunction that is called to get the source of a Python
   *          function
   */
  public void setWriteObjectCallBack(PyFunction callBack) {
    this.writeObjectCallBack = callBack;
  }
}
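// A hypothetical subclass sketch (not part of this source; the class name and
// body are made up for illustration) showing how a derived operation is
// expected to use the wrapper: reserve one leading slot in callArgs, fill it
// with the converted input tuple, and invoke the Python function through
// callFunction():
//
//   public class HypotheticalFunctionWrapper extends CascadingBaseOperationWrapper {
//     @Override
//     public int getNumParameters() {
//       return 1;
//     }
//
//     public void operate(FlowProcess flowProcess, FunctionCall functionCall) {
//       callArgs[0] = Py.java2py(convertInput(functionCall.getArguments()));
//       PyObject ret = callFunction();
//       // collect the output tuples from ret...
//     }
//   }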