#!/bin/bash

# Resolve the directory that physically contains this script and store it in
# SCRIPT_DIR. Symlinks to the script are followed one hop at a time so that the
# paths derived from SCRIPT_DIR further down point into the real installation
# rather than next to whatever symlink was used to start the script.
SOURCE="${BASH_SOURCE[0]}"
while [ -h "$SOURCE" ]; do # resolve $SOURCE until the file is no longer a symlink
  DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"
  SOURCE="$(readlink "$SOURCE")"
  # if $SOURCE was a relative symlink, we need to resolve it relative to the
  # path where the symlink file was located
  [[ $SOURCE != /* ]] && SOURCE="$DIR/$SOURCE"
done
SCRIPT_DIR="$( cd -P "$( dirname "$SOURCE" )" && pwd )"

# === ML Server Pyspark ===
#
# Find the "pyspark" command to execute and set the revoscalepy pyspark interop env variables.
#
# Implements the logic of searching for the "pyspark" command-line
# utility. It is used mainly from Python when launching pyspark with revoscalepy interop.
#
  
# Look for spark home location in the system
#
# Globals:
#   SPARK_HOME          read/written. Left untouched when it already names a
#                       directory; otherwise set to the discovered location.
#   SPARK_BIN           written. Resolved path of spark-submit, when found.
#   SPARK_MAJOR_VERSION read. Selects the HDP client; the value 1 is rejected.
# Search order (each later step runs only while SPARK_HOME is empty or lacks
# a conf/ or yarn/ sub-directory):
#   1. spark-submit on PATH, else inside an installed rpm/dpkg package whose
#      name contains "spark"; SPARK_HOME is two levels above the binary
#   2. HDP layout under /usr/hdp/current
#   3. CDH SPARK2 parcels
# Exits the shell with status 1 when SPARK_MAJOR_VERSION is 1 or when no usable
# Spark installation is found. All diagnostics go to stderr.

# True when SPARK_HOME is non-empty and contains both conf/ and yarn/.
function _spark_home_is_usable()
{
  [ -n "${SPARK_HOME}" ] && [ -d "${SPARK_HOME}/conf" ] && [ -d "${SPARK_HOME}/yarn" ]
}

function find_spark_home()
{
  local rpm_cmd dpkg_cmd spark_packages found_bin hdp_client num_spark parcel_dir

  # A SPARK_HOME that already points at a directory is used as-is.
  if [ -d "${SPARK_HOME}" ] ; then
    return 0
  fi

  SPARK_BIN="$(command -v spark-submit 2>/dev/null)"
  if [ -z "${SPARK_BIN}" ] ; then
    rpm_cmd="$(command -v rpm 2>/dev/null)"
    if [ -n "${rpm_cmd}" ] ; then
      spark_packages="$("${rpm_cmd}" -qa 2>/dev/null | grep spark)"
      if [ -n "${spark_packages}" ] ;  then
        # ${spark_packages} is left unquoted on purpose: one package per word.
        SPARK_BIN="$("${rpm_cmd}" -ql ${spark_packages} 2>/dev/null | grep -m 1 'spark-submit$')"
      fi
    fi

    # handle ubuntu os
    dpkg_cmd="$(command -v dpkg-query 2>/dev/null)"
    if [ -n "${dpkg_cmd}" ] ; then
      spark_packages="$("${dpkg_cmd}" -W -f='${Package}\n' 2>/dev/null | grep spark)"
      if [ -n "${spark_packages}" ] ;  then
        found_bin="$("${dpkg_cmd}" -L ${spark_packages} 2>/dev/null | grep -m 1 'spark-submit$')"
        # Only replace an rpm hit when dpkg really located the launcher, so an
        # empty dpkg result cannot wipe out a launcher that was already found.
        if [ -n "${found_bin}" ] ; then
          SPARK_BIN="${found_bin}"
        fi
      fi
    fi
  fi

  if [ -n "${SPARK_BIN}" ]; then
    SPARK_BIN="$( readlink -f "${SPARK_BIN}" 2>/dev/null )"
    # Skip the derivation when the launcher could not be resolved; otherwise
    # "/../../" would canonicalise to "/" and be recorded as SPARK_HOME.
    if [ -n "${SPARK_BIN}" ]; then
      SPARK_HOME="$( readlink -m "${SPARK_BIN}/../../" 2>/dev/null )"
    fi
  fi

  if ! _spark_home_is_usable ; then
    # if the SPARK_HOME found is not as expected. Then try HDP specific logic first
    # reference logic in HDP /usr/bin/spark-script-wrapper.sh
    if [ -z "${SPARK_MAJOR_VERSION}" ]; then
      num_spark=0
      # An unmatched glob stays literal and is rejected by the -d test.
      for hdp_client in /usr/hdp/current/spark*-client; do
        if [ -d "${hdp_client}" ]; then
          num_spark=$(( num_spark + 1 ))
          SPARK_HOME="${hdp_client}"
        fi
      done

      if [ "${num_spark}" -gt "1" ]; then
        echo "Multiple versions of Spark are installed but SPARK_MAJOR_VERSION is not set" 1>&2
        echo "Spark2 will be picked by default" 1>&2
        SPARK_HOME="/usr/hdp/current/spark2-client"
      fi

    elif [ "${SPARK_MAJOR_VERSION}" -eq "1" ]; then
      echo "SPARK_MAJOR_VERSION is set to 1, while current ML SERVER only support Spark 2." 1>&2
      exit 1
    elif [ "${SPARK_MAJOR_VERSION}" -eq "2" ]; then
      SPARK_HOME="/usr/hdp/current/spark2-client"
    fi
  fi

  if ! _spark_home_is_usable ; then
    # if the SPARK_HOME found is still not as expected. Then try CDH specific logic
    # pick the last as SPARK HOME if multiple SPARK 2 versions exist
    # /opt/cloudera/parcels is Cloudera's default parcel directory. The
    # /log/cloudera/parcels location that used to be the only one searched is
    # kept, and listed last so it still wins where it exists.
    for parcel_dir in /opt/cloudera/parcels/SPARK2* /log/cloudera/parcels/SPARK2*; do
      # if is a non-symlink dir
      if [ -d "${parcel_dir}" ] && [ ! -L "${parcel_dir}" ] ; then
        SPARK_HOME="${parcel_dir}/lib/spark2"
      fi
    done
  fi

  if ! _spark_home_is_usable ; then
    # still cannot find the SPARK_HOME. Fail.
    echo "Fail to find SPARK_HOME. Please set SPARK_HOME variable." >&2
    exit 1
  fi
}
# Resolve SPARK_HOME before looking for the launcher.
find_spark_home

# Choose the pyspark launcher: the one inside SPARK_HOME is preferred and
# /usr/bin/pyspark is the fallback. Abort when neither is executable.
PYSPARK_BIN="${SPARK_HOME}/bin/pyspark"
if [ ! -x "${PYSPARK_BIN}" ] ; then
    PYSPARK_BIN="/usr/bin/pyspark"
    [ -x "${PYSPARK_BIN}" ] || {
        echo "cannot find the 'pyspark' script/executable" >&2
        exit -1
    }
fi

# Locate the ML Server Python interpreter and store its resolved path in
# MLSERVER_PYTHON: /usr/bin/mlserver-python when it is executable, otherwise
# the interpreter at ../python/python relative to this script. Abort when the
# fallback is not executable either.
if [ -x "/usr/bin/mlserver-python" ] ; then
    MLSERVER_PYTHON="$( readlink -f /usr/bin/mlserver-python )"
else
    # Quoted so an installation path containing whitespace or glob characters
    # reaches readlink as a single argument.
    MLSERVER_PYTHON="$( readlink -m "${SCRIPT_DIR}/../python/python" )"
    if [ ! -x "${MLSERVER_PYTHON}" ]; then
        echo "cannot find the 'mlserver-python' script/executable" >&2
        exit -1
    fi
fi

# Export the environment pyspark needs to run on the ML Server interpreter.
# Globals:
#   MLSERVER_PYTHON        read. Interpreter path handed to pyspark.
#   SPARK_HOME             read. Root of the Spark installation.
#   PYSPARK_PYTHON         exported.
#   PYSPARK_DRIVER_PYTHON  exported.
#   PYTHONPATH             exported when a Spark python dir or py4j zip exists;
#                          new entries are prepended to the existing value.
#   PY4J                   written. Chosen py4j zip, or empty when none exists.
function set_pyspark_interop()
{
  local py4j_candidate

  # for pyspark interop, make sure pyspark point to mlserver-python
  export PYSPARK_PYTHON=${MLSERVER_PYTHON}
  # for CDH 5.8 above version, also need to set this param
  export PYSPARK_DRIVER_PYTHON=${MLSERVER_PYTHON}

  # find pyspark lib to include
  if [ -d "${SPARK_HOME}/python" ] ; then
    # ${PYTHONPATH:+...} avoids a trailing ":" (an empty path entry, which
    # Python resolves to the current directory) when PYTHONPATH was empty.
    export PYTHONPATH="${SPARK_HOME}/python${PYTHONPATH:+:${PYTHONPATH}}"
  fi

  # find py4j lib to include
  # Take the first regular (non-symlink) file in glob order. Capturing every
  # match in one string made the -f test fail as soon as two zips were present.
  PY4J=""
  for py4j_candidate in "${SPARK_HOME}"/python/lib/py4j*src.zip; do
    if [ -f "${py4j_candidate}" ] && [ ! -L "${py4j_candidate}" ] ; then
      PY4J="${py4j_candidate}"
      break
    fi
  done
  if [ -n "${PY4J}" ] ; then
    export PYTHONPATH="${PY4J}${PYTHONPATH:+:${PYTHONPATH}}"
  fi
}
set_pyspark_interop

# Resolve the revoscalepy Spark jar relative to this script and abort when it
# is not a regular file. The path is quoted so an installation directory
# containing whitespace reaches readlink as a single argument.
REVOSCALEPY_SPARK_JAR="$( readlink -m "${SCRIPT_DIR}/../../libraries/common/hadoop/jar/scaler-spark_2.11-0.1.0.jar" )"
if [ ! -f "${REVOSCALEPY_SPARK_JAR}" ] ; then
    echo "cannot find the 'scaler-spark_2.11-0.1.0.jar' in path ${REVOSCALEPY_SPARK_JAR}. Please check your mlserver installation." >&2
    exit -1
fi

# Same logic located in Revo-init, Need to be changed simultaneously
# (the copy in Revo-init should receive the same fixes as this one).
#
# Export the NAME=VALUE assignments found in a file into the current shell.
# Arguments:
#   $1 - path of the file to read. Each line is "NAME=VALUE", optionally
#        prefixed with "export ".
# Behaviour:
#   - leading/trailing whitespace of every line is trimmed by `read`
#   - backslashes are kept verbatim (read -r)
#   - only a leading "export " is removed, never one inside the value
#   - lines without "=", lines whose NAME is not a valid shell identifier
#     (comments, junk) and lines with an empty VALUE are skipped
#   - VALUE is everything after the first "="
#   - a final line without a trailing newline is still processed
function loadExportFileLineByLine()
{
  local EXPORT_FILE=$1
  local line var_name var_value
  local -r name_re='^[A-Za-z_][A-Za-z0-9_]*$'

  while read -r line || [ -n "${line}" ] ; do
    line="${line#export }"
    # Without "=" there is no value to assign.
    [[ "${line}" == *=* ]] || continue
    # split line in = character, into variable name and value
    var_name="${line%%=*}"
    var_value="${line#*=}"
    # Reject anything that is not a plain identifier before it reaches eval.
    [[ "${var_name}" =~ ${name_re} ]] || continue
    if [ -n "${var_value}" ] ; then
      # if the variable is not empty, export it in this session
      # NOTE(review): eval is kept so values behave as before, but a value
      # containing a single quote is interpreted by the shell. Only feed this
      # function trusted files, or replace with: export "${var_name}=${var_value}"
      eval "export '${var_name}'='${var_value}'"
    fi
  done < "${EXPORT_FILE}"
}

# Load the ML Server environment: source Revo-init when it sits one level above
# this script, otherwise export the variables listed in RevoHadoopEnvVars.site.
if [ -e "${SCRIPT_DIR}/../Revo-init" ] ; then
  # need to define ML_SERVER_ROOT before sourcing Revo-init
  ML_SERVER_ROOT="${SCRIPT_DIR}/../.."
  source "${SCRIPT_DIR}/../Revo-init"
else
  # Paths are quoted so an installation directory containing whitespace is
  # passed on as a single argument.
  REVOSCALEPY_ENV_VARS_FILE="$( readlink -m "${SCRIPT_DIR}/../../libraries/common/hadoop/RevoHadoopEnvVars.site" )"
  if [ ! -f "${REVOSCALEPY_ENV_VARS_FILE}" ] ; then
      echo "cannot find the 'RevoHadoopEnvVars.site' in path ${REVOSCALEPY_ENV_VARS_FILE}. Please check your mlserver installation." >&2
      exit -1
  fi
  # load .site file
  # It can help solving CDH 5.7 and 5.8 JAVA_HOME setting issue
  # stderr is discarded on purpose: lines of the .site file that are not
  # assignments would otherwise produce noise.
  loadExportFileLineByLine "${REVOSCALEPY_ENV_VARS_FILE}" 2>/dev/null
fi

# Build the pyspark command line and replace this process with pyspark.
# The caller's arguments are forwarded unchanged, except that the revoscalepy
# jar is appended to any argument starting with "spark.jars=". When no such
# argument exists, "--conf spark.jars=<jar>" is added instead.
# An array is used so arguments containing whitespace or glob characters, and
# empty arguments, reach pyspark exactly as they were given.
# NOTE(review): ADDED_REVOSCALEPY_SPARK_JAR is not reset here, so a value
# inherited from the environment also suppresses the --conf injection; confirm
# whether that is intended.
ARGS_TO_CALL=()
for var in "$@"
do
    if [[ $var == "spark.jars="* ]]; then
        ARGS_TO_CALL+=("${var},${REVOSCALEPY_SPARK_JAR}")
        ADDED_REVOSCALEPY_SPARK_JAR=TRUE
    else
        ARGS_TO_CALL+=("$var")
    fi
done
if [ -z "${ADDED_REVOSCALEPY_SPARK_JAR}" ]; then
    ARGS_TO_CALL+=(--conf "spark.jars=${REVOSCALEPY_SPARK_JAR}")
fi

exec "${PYSPARK_BIN}" "${ARGS_TO_CALL[@]}"
