#!/bin/bash

# === MRS Spark ===
#
# Find the "spark-submit" command.
#
# Implements the logic of searching for the "spark-submit" command-line
# utility. It is used mainly in R when connecting from a remote
# client and running spark-submit commands via ssh.
#
  
# Look for spark-submit location in the system.
#
# Prints the path of the spark-submit executable on stdout, or prints nothing
# when it cannot be located. Locations are tried in this order and the first
# hit wins:
#   1. ${SPARK_HOME}/bin/spark-submit, when that file is executable
#   2. spark-submit resolved through PATH
#   3. files owned by installed rpm packages whose name contains "spark"
#   4. files owned by installed dpkg packages whose name contains "spark"
#      (mapr-spark is always queried as well)
#
# Globals:
#   SPARK_HOME (read)
function find_spark_bin()
{
  local found pkg_query packages

  if [ -x "${SPARK_HOME}/bin/spark-submit" ] ; then
    printf '%s\n' "${SPARK_HOME}/bin/spark-submit"
    return 0
  fi

  found=$(command -v spark-submit 2>/dev/null)

  # rpm based systems
  if [ -z "${found}" ] ; then
    pkg_query=$(command -v rpm 2>/dev/null)
    if [ -n "${pkg_query}" ] ; then
      packages=$("${pkg_query}" -qa | grep spark)
      if [ -n "${packages}" ] ; then
        # ${packages} is unquoted on purpose: it holds one package name per
        # line and every name has to become its own argument.
        # The leading "/" in the pattern keeps files that merely END in
        # "spark-submit" (e.g. mrs-spark-submit) from being picked.
        found=$("${pkg_query}" -ql ${packages} | grep -m 1 '/spark-submit$')
      fi
    fi
  fi

  # handle ubuntu os
  # Only consulted while nothing has been found yet, so that a dpkg lookup
  # without a hit cannot wipe out a path already discovered through rpm.
  if [ -z "${found}" ] ; then
    pkg_query=$(command -v dpkg-query 2>/dev/null)
    if [ -n "${pkg_query}" ] ; then
      packages=$("${pkg_query}" -W -f='${Package}\n' | grep spark)
      if [ -n "${packages}" ] ; then
        # ${packages} is unquoted on purpose, see the rpm branch.
        found=$("${pkg_query}" -L mapr-spark ${packages} | grep -m 1 '/spark-submit$')
      fi
    fi
  fi

  if [ -n "${found}" ] ; then
    printf '%s\n' "${found}"
  fi
  return 0
}

# Path of the spark-submit executable; empty when it could not be located.
SPARK_BIN=$(find_spark_bin)



# First command-line argument; it selects the mode of this wrapper.
command="${1}"

# Print the usage text of this wrapper on stdout.
function print_help()
{
  printf '%s\n' \
    '  SYNOPSIS:' \
    '      mrs-spark-submit --mrs-help' \
    '      mrs-spark-submit --mrs-home' \
    '      mrs-spark-submit --mrs-version' \
    '      mrs-spark-submit [values to run spark-submit]' \
    '' \
    '  DESCRIPTION:' \
    '      The wrapper script to locate spark-submit command and run it. Or find spark home location. Or find spark version.' \
    '' \
    '  COMMAND:' \
    '      mrs-spark-submit --mrs-help' \
    '          Print help message of this script.' \
    '      mrs-spark-submit --mrs-home' \
    '          Find the SPARK HOME and print in stdout.' \
    '      mrs-spark-submit --mrs-version' \
    '          Find the SPARK VERSION and print in stdout.' \
    '      mrs-spark-submit [values to run spark-submit]' \
    '          Run spark-submit command with extra values from input.' \
    '' \
    '  OPTION:' \
    '      --mrs-help       [optional] Print help message of this script.' \
    '      --mrs-home       [optional] Find the SPARK HOME and print in stdout.' \
    '      --mrs-version    [optional] Find the SPARK VERSION and print in stdout.' \
    '' \
    '  RETURN CODE:' \
    '      0 for success.' \
    '      < 0 for error.' \
    '      > 0 for warning.'
}

# Emit the "Spark home not found" message and terminate the script.
#
# The message is written to stdout between the %PROTOCOL-STDERR% and
# %PROTOCOL-END% markers, after an empty %PROTOCOL-STDOUT% section.
# Does not return: exits with -1 (seen as 255 by the parent process).
function report_no_spark_home_error()
{
  # Unquoted delimiter: $HOME is expanded, \$SPARK_HOME is printed literally.
  cat << EOF
%PROTOCOL-STDOUT%
%PROTOCOL-STDERR%
Failed to detect Spark home directory. Please set environment variable
SPARK_HOME for this user (e.g. in $HOME/.bashrc) or for the whole system
(e.g. in /etc/environment) to point Microsoft R Server to the correct
Spark home directory. A valid path SPARK_HOME will correctly resolve
to \$SPARK_HOME/bin/spark-submit
%PROTOCOL-END%
EOF
  exit -1
}

# Emit the "Spark version not found" message and terminate the script.
#
# The message is written to stdout between the %PROTOCOL-STDERR% and
# %PROTOCOL-END% markers, after an empty %PROTOCOL-STDOUT% section.
# Does not return: exits with -1 (seen as 255 by the parent process).
function report_no_spark_version_error()
{
  # One argument per output line; $HOME is expanded inside the double quotes.
  printf '%s\n' \
    "%PROTOCOL-STDOUT%" \
    "%PROTOCOL-STDERR%" \
    "Failed to detect Spark version. Please define environment variable" \
    "SPARK_VERSION for this user (e.g. in $HOME/.bashrc) or for the whole system" \
    "(e.g. in /etc/environment) to point Microsoft R Server to the correct" \
    "Spark version." \
    "%PROTOCOL-END%"
  exit -1
}

# Guess the Spark home directory on an HDP layout.
#
# Arguments:
#   $1 - optional HDP root directory; defaults to /usr/hdp
# Globals:
#   SPARK_BIN (read) - spark-submit executable, run with --version
# Outputs:
#   Prints "<root>/current/<spark dir>" on stdout. Prints nothing when the
#   root is missing, when it does not hold exactly one entry that looks like
#   a versioned HDP directory (d.d.d.d-d), or when no directory fitting the
#   detected Spark major version exists.
# Returns:
#   Always 0; callers decide from the (possibly empty) output.
function find_hdp_spark_home()
{
  local hdp_root="${1:-/usr/hdp}"
  # Dots are escaped so that only a literal "." separates the digits.
  local version_re='[[:digit:]]\.[[:digit:]]\.[[:digit:]]\.[[:digit:]]-[[:digit:]]'
  local entry candidate spark_version spark_major
  local version_dirs=0
  local have_candidate=0
  local -a candidates

  if [ ! -d "${hdp_root}" ]; then
    return 0
  fi

  # Count the versioned entries with a glob instead of parsing ls output.
  for entry in "${hdp_root}"/*; do
    if [[ "${entry##*/}" =~ ${version_re} ]]; then
      version_dirs=$((version_dirs + 1))
    fi
  done
  # cannot find a unique hdp dir.
  if [ "${version_dirs}" -ne 1 ]; then
    return 0
  fi

  # Do not launch spark-submit at all when no candidate directory exists.
  for candidate in spark spark2 spark-client spark2-client; do
    if [ -d "${hdp_root}/current/${candidate}" ]; then
      have_candidate=1
      break
    fi
  done
  if [ "${have_candidate}" -eq 0 ]; then
    return 0
  fi

  # check which spark dir is using
  # Only the first matching line is used; later lines of the --version output
  # may fit the same pattern (e.g. a "Using Scala version ..." line).
  spark_version=$("${SPARK_BIN}" --version 2>&1 \
    | grep -m 1 "version[[:space:]][[:digit:]]" \
    | sed "s/^.*version[[:space:]]*//")
  spark_major="${spark_version%%.*}"

  # Compare the major version as a number; a string comparison would sort
  # "10.0.0" before "2.0.0". An undetected version is treated like Spark 1.x.
  if [[ "${spark_major}" =~ ^[0-9]+$ ]] && [ "${spark_major}" -ge 2 ]; then
    candidates=(spark2 spark2-client)
  else
    candidates=(spark spark-client)
  fi

  for candidate in "${candidates[@]}"; do
    if [ -d "${hdp_root}/current/${candidate}" ]; then
      echo "${hdp_root}/current/${candidate}"
      return 0
    fi
  done
  return 0
}

# Derive the Spark home directory below a Cloudera (CDH) base directory.
#
# Arguments:
#   $1 - base directory
# Globals:
#   SPARK_BIN (read) - spark-submit executable, run with --version
# Outputs:
#   Prints "<base>/lib/spark2" when the detected Spark major version is 2 or
#   higher, "<base>/lib/spark" otherwise (including an undetected version).
function find_cdh_spark_home()
{
  local base_dir="$1"
  local spark_version spark_major

  # Only the first matching line is used; later lines of the --version output
  # may fit the same pattern (e.g. a "Using Scala version ..." line).
  spark_version=$("${SPARK_BIN}" --version 2>&1 \
    | grep -m 1 "version[[:space:]][[:digit:]]" \
    | sed "s/^.*version[[:space:]]*//")
  spark_major="${spark_version%%.*}"

  # Compare the major version as a number; a string comparison would sort
  # "10.0.0" before "2.0.0".
  if [[ "${spark_major}" =~ ^[0-9]+$ ]] && [ "${spark_major}" -ge 2 ]; then
    echo "${base_dir}/lib/spark2"
  else
    echo "${base_dir}/lib/spark"
  fi
}

# Determine the Spark home directory and print it in the protocol framing.
#
# The home is taken as two directory levels above the symlink-resolved
# SPARK_BIN; Cloudera and HDP layouts get an extra guess when that directory
# does not look like a Spark home.
#
# Globals:
#   SPARK_BIN (read) - path of the spark-submit executable
# Outputs:
#   %PROTOCOL-STDOUT%, the home directory, %PROTOCOL-STDERR%, %PROTOCOL-END%
#   on stdout. On failure report_no_spark_home_error prints its message and
#   exits the script.
function get_spark_home()
{
  local spark_bin_file
  local spark_home_dir=""

  if [ -z "${SPARK_BIN}" ] ; then
    report_no_spark_home_error
  fi

  # Every expansion is quoted so that paths containing whitespace survive.
  spark_bin_file=$(readlink -f "${SPARK_BIN}")
  if [ -n "${spark_bin_file}" ] ; then
    spark_home_dir=$(dirname "$(dirname "${spark_bin_file}")")
  fi

  # for cloudera, need special logic to decide whether it is the correct dir
  if [[ "${spark_home_dir}" == *"cloudera"* ]] && [ ! -f "${spark_home_dir}/bin/spark-class" ] ; then
    spark_home_dir=$(find_cdh_spark_home "${spark_home_dir}")
    if [ ! -f "${spark_home_dir}/bin/spark-class" ] ; then
        report_no_spark_home_error
    fi
  fi

  # If the directory derived from the spark-submit script has neither conf/
  # nor jars/ (some hdp clusters don't use symlink), try to make a guess for
  # hdp.
  if [ ! -d "${spark_home_dir}/conf" ] && [ ! -d "${spark_home_dir}/jars" ] && [ -d "/usr/hdp" ] ; then
    spark_home_dir=$(find_hdp_spark_home)
    if [ ! -d "${spark_home_dir}/conf" ] && [ ! -d "${spark_home_dir}/jars" ] ; then
        report_no_spark_home_error
    fi
  fi

  echo "%PROTOCOL-STDOUT%"
  echo "${spark_home_dir}"
  echo "%PROTOCOL-STDERR%"
  echo "%PROTOCOL-END%"
}

# Detect the Spark version and print it in the protocol framing.
#
# Globals:
#   SPARK_BIN (read) - spark-submit executable, run with --version
# Outputs:
#   %PROTOCOL-STDOUT%, the version, %PROTOCOL-STDERR%, %PROTOCOL-END% on
#   stdout. When SPARK_BIN is empty or no version can be extracted,
#   report_no_spark_version_error prints its message and exits the script.
#
# NOTE(review): the error text advises defining SPARK_VERSION, but nothing in
# this function reads that variable - confirm that the client consumes it.
function get_spark_version()
{
  local detected_version

  if [ -z "${SPARK_BIN}" ] ; then
    report_no_spark_version_error
  fi

  # The version is whatever follows "version " on the FIRST matching line.
  # Later lines of the --version output may fit the same pattern (Spark 2.x
  # prints a "Using Scala version ..." line); without -m 1 they would be
  # reported as additional lines of the version.
  detected_version=$("${SPARK_BIN}" --version 2>&1 \
    | grep -m 1 "version[[:space:]][[:digit:]]" \
    | sed "s/^.*version[[:space:]]*//")

  if [ -z "${detected_version}" ] ; then
    report_no_spark_version_error
  fi

  echo "%PROTOCOL-STDOUT%"
  echo "${detected_version}"
  echo "%PROTOCOL-STDERR%"
  echo "%PROTOCOL-END%"
}

# Dispatch on the first argument: the three --mrs-* switches are handled by
# this wrapper, anything else is handed over to spark-submit unchanged.
if [[ "${command}" == "--mrs-home" ]] ; then
  get_spark_home
elif [[ "${command}" == "--mrs-version" ]] ; then
  get_spark_version
elif [[ "${command}" == "--mrs-help" ]] ; then
  print_help
else
  if [ -z "${SPARK_BIN}" ] ; then
    echo "cannot find the 'spark-submit' script/executable" >&2
    exit -1
  fi
  # Replace this shell with spark-submit, passing every argument through.
  exec "${SPARK_BIN}" "$@"
fi

