#!/bin/sh

# Copyright (c) 2014-2015 Ganael LAPLANCHE <ganael.laplanche@martymac.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.

# This script is a simple wrapper showing how fpart can be used to migrate data.
# It uses fpart and rsync to spawn multiple rsync instances to migrate data
# from src_dir/ to dst_dir/. Rsync jobs can execute either locally or over SSH.
# This migration is incremental and will need a final rsync pass
# (rsync -av --delete src_dir/ dst_dir/) to remove extra files from dst_dir/.

FPSYNC_VERSION="0.9.2"

########## Built-in defaults (the matching command-line flag is given first)

# -n: how many sync jobs ("workers") are allowed to run at the same time
OPT_JOBS="2"
# The value may also be derived from the number of CPUs:
#OPT_JOBS="$(sysctl -n hw.ncpu)"    # FreeBSD
#OPT_JOBS="$(nproc)"                # Linux
# Job count of the run being resumed; filled in from that run's 'info' file
OPT_RJOBS=""
# -f: upper bound on the number of files given to a single sync job
OPT_FPMAXPARTFILES=2000
# -s: upper bound on the number of bytes given to a single sync job (4 GB)
OPT_FPMAXPARTSIZE=$((4 * 1024 * 1024 * 1024))
# -w: SSH workers; when left empty, jobs are executed locally
OPT_WRKRS=''
# -d: fpart shared directory (has to be shared amongst all workers)
OPT_FPSHDIR=''
# -t: temporary directory (local one, used for queue management)
OPT_TMPDIR='/tmp/fpsync'
# -r: name of the job to resume
OPT_JOBNAME=''
# -o: options handed over to rsync
OPT_RSYNC='-av --numeric-ids'
# -O: options handed over to fpart
OPT_FPART='-x .zfs -x .snapshot* -x .ckpt'
# -S: sudo mode, enabled when non-empty
OPT_SUDO=''
# -v: verbosity level, incremented once per occurrence of the flag
OPT_VERBOSE=0
# Positional argument: source directory
OPT_SRCDIR=''
# Positional argument: destination directory
OPT_DSTDIR=''

# Mail notification - uncomment the assignment below to get a mail once the
# whole run has finished. The master machine (the one running this script)
# must be able to send mail using the 'mail' command. No CLI option is
# provided to change it.
#OPT_MAIL="address@mydomain.tld"

########## Various functions

# Print the help text to stdout.
# Reads FPSYNC_VERSION for the banner and $0 for the program name; takes no
# argument. The text is emitted through a single here-document so that it
# reads exactly as it is displayed.
usage () {
    cat << EOF
fpsync v${FPSYNC_VERSION} - Sync directories in parallel using fpart and rsync
Copyright (c) 2014-2015 Ganael LAPLANCHE <ganael.laplanche@martymac.org>
WWW: http://contribs.martymac.org
Usage: $0 [OPTIONS...] /src_dir/ /dst_dir/
Usage: $0 [-r jobname] [OPTIONS...]
  -n x        start <x> concurrent sync jobs
  -f y        transfer at most <y> files per sync job
  -s z        transfer at most <z> bytes per sync job
  -w wrks     space-separated list of SSH workers
              e.g.: -w 'login@host1 login@host2 login@host3'
              or:   -w 'login@host1' -w 'login@host2' -w 'login@host3'
              Jobs are executed locally if not specified (default).
  -d /dir/    set fpsync shared dir to </dir/> (absolute path)
              This option is mandatory when using SSH workers.
  -t /dir/    set fpsync temp dir to </dir/> (absolute path)
  -r jobname  resume job <jobname>
              (options -f, -s, -o, /src_dir/ and /dst_dir/
              are ignored when resuming a previous run)
  -o options  override default rsync options with <options>
              Use this option with care as certain options are
              incompatible with a parallel usage (e.g. --delete)
  -O options  override default fpart options with <options>
  -S          use sudo for filesystem crawling and synchronizations
  -v          verbose mode (default: quiet)
              This option can be specified several times to
              increase verbosity level.
  -h          this help
  /src_dir/   source directory (absolute path)
  /dst_dir/   destination directory (absolute path)
See fpsync(1) for more details.
EOF
}

# Terminate the program with exit status 1
# $1 = optional message, written to stderr when non-empty
end_die () {
    if [ -n "$1" ]
    then
        echo "$1" >&2
    fi
    exit 1
}

# Log a message and, depending on verbosity, also print it to stdout
# $1 = level required for printing (0 = quiet, 1 = verbose, >=2 more verbose)
# $2 = message
# The message is always appended to ${FPART_LOGFILE}; it is printed only when
# $1 is a number and OPT_VERBOSE is at least $1. Empty messages are dropped.
echo_log () {
    # Nothing to print nor to log
    [ -n "$2" ] || return 1

    if is_num "$1" && [ "${OPT_VERBOSE}" -ge "$1" ]
    then
        echo "$2"
    fi
    echo "$2" >> "${FPART_LOGFILE}"
}

# Check if $1 is an absolute path, i.e. if its very first character is '/'
# Returns 0 when it is, 1 otherwise (including for an empty or missing $1).
# The whole string is matched by the shell itself: unlike a line-based grep,
# a multi-line value is not accepted just because one of its lines starts
# with '/', and no external process is spawned.
is_abs_path() {
    case "$1" in
    /*)
        return 0
        ;;
    *)
        return 1
        ;;
    esac
}

# Check if $1 is a number, i.e. a non-empty string made of digits 0-9 only
# Returns 0 when it is, 1 otherwise.
# The whole string is matched by the shell itself: unlike a line-based grep,
# a multi-line value is not accepted just because one of its lines is
# numeric, and no external process is spawned.
is_num () {
    case "$1" in
    ''|*[!0-9]*)
        # Empty, or holding at least one non-digit character
        return 1
        ;;
    *)
        return 0
        ;;
    esac
}

# Read the command line and fill in the OPT_* global variables
# $@ = arguments, as given to the script
# Any invalid value terminates the program through end_die(); -h prints the
# help and exits with status 0.
parse_opts () {
    local _flag OPTARG OPTIND

    while getopts "n:f:s:w:d:t:r:o:O:Svh" _flag
    do
        case "${_flag}" in
        n)
            { is_num "${OPTARG}" && [ "${OPTARG}" -ge 1 ] ;} || \
                end_die "Option -n expects a numeric value >= 1"
            OPT_JOBS=${OPTARG}
            ;;
        f)
            { is_num "${OPTARG}" && [ "${OPTARG}" -gt 0 ] ;} || \
                end_die "Option -f expects a numeric value > 0"
            OPT_FPMAXPARTFILES=${OPTARG}
            ;;
        s)
            { is_num "${OPTARG}" && [ "${OPTARG}" -gt 0 ] ;} || \
                end_die "Option -s expects a numeric value > 0"
            OPT_FPMAXPARTSIZE=${OPTARG}
            ;;
        w)
            [ -n "${OPTARG}" ] || \
                end_die "Invalid workers list supplied"
            # May be given several times: every occurrence extends the list
            OPT_WRKRS="${OPT_WRKRS} ${OPTARG}"
            ;;
        d)
            is_abs_path "${OPTARG}" || \
                end_die "Please supply an absolute path for shared dir"
            OPT_FPSHDIR="${OPTARG}"
            ;;
        t)
            is_abs_path "${OPTARG}" || \
                end_die "Please supply an absolute path for temp dir"
            OPT_TMPDIR="${OPTARG}"
            ;;
        r)
            [ -n "${OPTARG}" ] || \
                end_die "Invalid job name supplied"
            OPT_JOBNAME="${OPTARG}"
            ;;
        o)
            [ -n "${OPTARG}" ] || \
                end_die "Invalid rsync options supplied"
            OPT_RSYNC="${OPTARG}"
            ;;
        O)
            [ -n "${OPTARG}" ] || \
                end_die "Invalid fpart options supplied"
            OPT_FPART="${OPTARG}"
            ;;
        S)
            OPT_SUDO="yes"
            ;;
        v)
            OPT_VERBOSE="$((${OPT_VERBOSE} + 1))"
            ;;
        h)
            usage
            exit 0
            ;;
        *)
            usage
            end_die "Invalid option specified"
            ;;
        esac
    done
    # Drop the options, only positional arguments remain
    shift $((${OPTIND} - 1))

    # Shared directory: mandatory with SSH workers, otherwise it falls back
    # to the temporary directory
    if [ -z "${OPT_FPSHDIR}" ]
    then
        if [ -n "${OPT_WRKRS}" ]
        then
            end_die "Please supply a shared dir when specifying workers"
        fi
        OPT_FPSHDIR="${OPT_TMPDIR}"
    fi

    # src_dir/ and dst_dir/ are only needed (and checked) for a new run
    if [ -z "${OPT_JOBNAME}" ]
    then
        if ! is_abs_path "$1" || ! is_abs_path "$2"
        then
            usage
            end_die "Please supply an absolute path for both src_dir/ and dst_dir/"
        fi
        OPT_SRCDIR="$1"
        OPT_DSTDIR="$2"
    fi
}

########## Work-related functions (in-memory, running-jobs handling)

# Fill WORK_FREEWORKERS with OPT_JOBS entries taken from OPT_WRKRS in a
# round-robin fashion, which gives each worker a fixed number of slots.
# When OPT_WRKRS holds no worker at all, it is reset to an empty string and
# the single pseudo-worker "local" is used instead.
work_list_free_workers_init () {
    local _count=$(echo ${OPT_WRKRS} | awk '{print NF}')
    if [ ${_count} -gt 0 ]
    then
        # One awk pass emits " <worker>" OPT_JOBS times, cycling over fields
        WORK_FREEWORKERS="${WORK_FREEWORKERS}$(echo ${OPT_WRKRS} | \
            awk -v jobs="${OPT_JOBS}" \
                '{ for (i = 0; i < jobs; i++) printf " %s", $(i % NF + 1) }')"
    else
        OPT_WRKRS=""
        WORK_FREEWORKERS="local"
    fi
}

# Print (to stdout) the first entry of WORK_FREEWORKERS
# The list itself is left untouched; an empty list gives an empty line.
work_list_pick_next_free_worker () {
    awk '{print $1}' << EOF
${WORK_FREEWORKERS}
EOF
}

# Drop the first entry of WORK_FREEWORKERS
# The remaining entries are stored back separated by single spaces, without
# any leading or trailing blank.
work_list_trunc_next_free_worker () {
    # Let the shell split the list into this function's positional parameters
    set -- ${WORK_FREEWORKERS}
    if [ $# -gt 0 ]
    then
        shift
    fi
    WORK_FREEWORKERS="$*"
}

# Register a new running work
# $1 = work entry, appended to WORK_LIST; WORK_NUM is incremented accordingly
# An empty $1 is silently ignored.
work_list_push () {
    [ -n "$1" ] || return 0

    WORK_NUM="$((${WORK_NUM} + 1))"
    WORK_LIST="${WORK_LIST} $1"
}

# Rebuild the currently-running jobs' list by examining each process' state
# Every entry of WORK_LIST has the form PID:WORKER. Entries whose PID is still
# reported by ps(1) are kept; for the other ones, a message is logged and,
# when SSH workers are in use (OPT_WRKRS non-empty), the worker is given back
# to WORK_FREEWORKERS. WORK_LIST and WORK_NUM are updated accordingly.
work_list_refresh () {
    # Keep the loop variable local: it must not leak into the global scope
    local _JOB
    local _WORK_LIST=""
    local _WORK_NUM=0
    local _WORKER=""
    for _JOB in ${WORK_LIST}
    do
        # If the process is still alive, keep it
        # (PID = first ':'-separated field, extracted without forking)
        if ps "${_JOB%%:*}" 1>/dev/null 2>&1
        then
            _WORK_LIST="${_WORK_LIST} ${_JOB}"
            _WORK_NUM="$((${_WORK_NUM} + 1))"
        # If not, put its worker to the free list
        else
            echo_log "2" "<= [QMGR] Job ${_JOB} finished"
            if [ -n "${OPT_WRKRS}" ]
            then
                # Worker = second ':'-separated field only
                _WORKER="${_JOB#*:}"
                WORK_FREEWORKERS="${WORK_FREEWORKERS} ${_WORKER%%:*}"
            fi
        fi
    done
    WORK_LIST=${_WORK_LIST}
    WORK_NUM=${_WORK_NUM}
}

########## Jobs-related functions (on-disk, jobs' queue handling)

# Create the job queue directory (JOBS_QUEUEDIR) and the job work directory
# (JOBS_WORKDIR), parents included. The program is terminated through
# end_die() as soon as one of them cannot be created.
job_queue_init () {
    if ! mkdir -p "${JOBS_QUEUEDIR}" 2>/dev/null
    then
        end_die "Cannot create job queue directory ${JOBS_QUEUEDIR}"
    fi
    if ! mkdir -p "${JOBS_WORKDIR}" 2>/dev/null
    then
        end_die "Cannot create job work directory ${JOBS_WORKDIR}"
    fi
}

# Dump job queue information to allow later resuming
# Writes ${JOBS_QUEUEDIR}/info, a shell fragment that assigns OPT_RJOBS
# (current OPT_JOBS), OPT_SRCDIR and OPT_DSTDIR and that is meant to be
# sourced when the job is resumed. Terminates through end_die() when the file
# cannot be written.
job_queue_info_dump () {
    local _TMPMASK="$(umask)"
    local _SRCDIR=""
    local _DSTDIR=""

    # Create "info" file with a restrictive umask so that a newly-created
    # file is only accessible to its owner
    umask "0077"
    touch "${JOBS_QUEUEDIR}/info" 2>/dev/null
    umask "${_TMPMASK}"

    # As the file gets sourced, paths are written single-quoted, every
    # embedded single quote being turned into '\'' : characters such as
    # double quotes, dollars, backquotes or backslashes are then reloaded
    # verbatim instead of being interpreted by the shell.
    _SRCDIR="$(printf '%s\n' "${OPT_SRCDIR}" | sed "s/'/'\\\\''/g")"
    _DSTDIR="$(printf '%s\n' "${OPT_DSTDIR}" | sed "s/'/'\\\\''/g")"

    # Dump information necessary for job resuming
    cat << EOF > "${JOBS_QUEUEDIR}/info" || \
        end_die "Cannot record job information"
# Job information used for resuming, do not edit !
OPT_RJOBS="${OPT_JOBS}"
OPT_SRCDIR='${_SRCDIR}'
OPT_DSTDIR='${_DSTDIR}'
EOF
}

# Reload the information recorded for a previous run
# Sources ${JOBS_QUEUEDIR}/info, then checks the values it is expected to
# provide: OPT_RJOBS must be a number >= 1, OPT_SRCDIR and OPT_DSTDIR must be
# absolute paths. Terminates through end_die() otherwise.
job_queue_info_load () {
    # The file is a shell fragment: sourcing it sets the variables
    if ! . "${JOBS_QUEUEDIR}/info"
    then
        end_die "Cannot read job information"
    fi

    # Never trust what has been loaded
    if ! is_num "${OPT_RJOBS}" || [ "${OPT_RJOBS}" -lt 1 ]
    then
        end_die "Invalid option value loaded from resumed job: OPT_RJOBS"
    fi
    if ! is_abs_path "${OPT_SRCDIR}" || ! is_abs_path "${OPT_DSTDIR}"
    then
        end_die "Invalid options value loaded from resumed job: OPT_SRCDIR/OPT_DSTDIR"
    fi
}

# Raise the "fp_done" (fpart done) flag by creating/touching the file of that
# name within the job queue directory
job_queue_fp_done () {
    local _flag="${JOBS_QUEUEDIR}/fp_done"

    # Wait a full second first, so that this very last file ends up with an
    # mtime later than the last job file's one. Necessary for filesystems
    # that don't get below the second for mtime precision (msdosfs).
    sleep 1
    touch "${_flag}"
}

# Raise the "sl_done" (sync loop done) flag by creating/touching the file of
# that name within the job queue directory
job_queue_sl_done () {
    local _flag="${JOBS_QUEUEDIR}/sl_done"

    touch "${_flag}"
}

# Raise the "sl_stop" (sync loop stop) flag by creating/touching the file of
# that name within the job queue directory
job_queue_sl_stop () {
    local _flag="${JOBS_QUEUEDIR}/sl_stop"

    touch "${_flag}"
}

# ^C handler of the master process
# Every ^C is counted and raises the "sl_stop" flag. The first one also waits
# for child processes to finish, displays the current status (verbose mode
# only) and terminates the program through end_die(). Later ones do nothing
# more here.
sigint_handler () {
    SIGINT_COUNT="$((${SIGINT_COUNT} + 1))"
    job_queue_sl_stop

    # Only the very first ^C goes through the shutdown sequence below
    [ ${SIGINT_COUNT} -eq 1 ] || return 0

    echo_log "1" "===> Interrupted. Waiting for running jobs to complete..."
    echo_log "1" "===> (hit ^C again to kill them and exit)"
    # Block until every child process has exited
    wait
    # Show where we stand before leaving
    if [ ${OPT_VERBOSE} -ge 1 ]
    then
        siginfo_handler
    fi
    # Leave with an error status
    end_die
}

# Handle subsequent ^C from within job_queue_loop(): kill sync processes
# to fast-unlock the main process (waiting for child processes to exit)
# Every ^C is counted; only the second one kills the processes recorded in
# WORK_LIST (entries of the form PID:WORKER), then waits for them.
job_queue_loop_sigint_handler () {
    # This handler may fire while work_list_refresh() is iterating with a
    # variable of the same name: keep ours local so that the interrupted
    # loop finds its own value untouched when it resumes.
    local _JOB

    SIGINT_COUNT="$((${SIGINT_COUNT} + 1))"
    if [ ${SIGINT_COUNT} -eq 2 ]
    then
        echo_log "1" "===> Interrupted again, killing remaining jobs"
        for _JOB in ${WORK_LIST}
        do
            # PID = first ':'-separated field, extracted without forking
            kill "${_JOB%%:*}" 1>/dev/null 2>&1
        done
        # Wait for child processes to exit and let parent process end_die()
        wait
    fi
}

# ^T handler: print (to stdout) how far the queue has been processed
# The total is the number of entries found in FPART_PARTSDIR; the number of
# parts done is the number of entries found in JOBS_WORKDIR, leaving out
# those matching 'fp_done'.
siginfo_handler () {
    local _total="$(cd "${FPART_PARTSDIR}" && ls -t1 | wc -l)"
    local _done="$(cd "${JOBS_WORKDIR}" && ls -t1 | grep -v 'fp_done' | wc -l)"
    local _pct="??"

    # No percentage can be computed as long as no part exists
    if [ ${_total} -ge 1 ]
    then
        _pct="$(( (${_done} * 100) / ${_total} ))"
    fi

    # Arithmetic expansion gets rid of any blank surrounding the counters
    _total="$(( ${_total} + 0))"
    _done="$(( ${_done} + 0))"

    echo "<=== Parts done: ${_done}/${_total} (${_pct}%), remaining: $((${_total} - ${_done}))"
}

# Dequeue the next job and print its name (relative to ${JOBS_WORKDIR}/)
# Prints nothing when no job is available.
# Entries that may be found within JOBS_QUEUEDIR:
# <job_number>: a sync job to perform
# 'info': info file regarding this fpsync run (never dequeued)
# 'sl_stop': the 'immediate stop' flag, set when ^C is hit
# 'fp_done': set when fpart has finished crawling src_dir/ and generated jobs
# 'sl_done': set when job_queue_loop() terminates (either normally or stopped)
# As soon as 'sl_stop' exists, "sl_stop" is printed and nothing is dequeued.
# Otherwise, the oldest entry (by modification time) is moved to JOBS_WORKDIR
# before its name gets printed.
job_queue_next () {
    local _oldest=""

    # The stop flag takes precedence over any pending job
    if [ -f "${JOBS_QUEUEDIR}/sl_stop" ]
    then
        echo "sl_stop"
        return 0
    fi

    _oldest=$(cd "${JOBS_QUEUEDIR}" && ls -rt1 | grep -v "info" | head -n 1)
    # Empty queue: stay silent
    [ -n "${_oldest}" ] || return 0

    mv "${JOBS_QUEUEDIR}/${_oldest}" "${JOBS_WORKDIR}" || \
        end_die "Cannot dequeue next job"
    echo "${_oldest}"
}

# Main jobs' loop: pick up jobs within the queue directory and start them
# The loop keeps polling job_queue_next() until it returns either "fp_done"
# or "sl_stop". At most OPT_JOBS jobs run at the same time (WORK_NUM is the
# current count, maintained by work_list_push() and work_list_refresh()).
# Each job is a shell script found in JOBS_WORKDIR, executed in the background
# either locally or through SSH on the next free worker.
# Once the loop is over, remaining jobs are waited for and the 'sl_done' flag
# is set.
# This function is started in the background by the main program.
job_queue_loop () {
    echo_log "2" "===> [QMGR] Starting queue manager"

    # Trap SIGINT
    trap 'job_queue_loop_sigint_handler' 2
    # Ignore SIGINFO from within loop, handled by the parent (master) process
    # NOTE(review): the signal is given by number (29); signal numbering is
    # system-dependent - confirm it designates SIGINFO on the targeted systems
    trap '' 29

    local _NEXT=""
    while [ "${_NEXT}" != "fp_done" ] && [ "${_NEXT}" != "sl_stop" ]
    do
        # NOTE(review): _PID is assigned here but never read in this function
        local _PID=""
        # Only dequeue a job when a slot is available
        if [ ${WORK_NUM} -lt ${OPT_JOBS} ]
        then
            # job_queue_next() moves the job file to JOBS_WORKDIR and prints
            # its name (empty when the queue is empty)
            _NEXT="$(job_queue_next)"
            # Flags ("fp_done", "sl_stop") are not jobs: they only end the loop
            if [ -n "${_NEXT}" ] && \
                [ "${_NEXT}" != "fp_done" ] && \
                [ "${_NEXT}" != "sl_stop" ]
            then
                if [ -z "${OPT_WRKRS}" ]
                then
                    # Local mode: run the job file with /bin/sh, in background
                    echo_log "2" "=> [QMGR] Starting job ${JOBS_WORKDIR}/${_NEXT} (local)"
                    /bin/sh "${JOBS_WORKDIR}/${_NEXT}" &
                    # $! is the PID of the background process just started
                    work_list_push "$!:local"
                else
                    # SSH mode: reserve the first free worker, then feed the
                    # job file to a remote /bin/sh through its stdin
                    local _NEXT_HOST="$(work_list_pick_next_free_worker)"
                    work_list_trunc_next_free_worker
                    echo_log "2" "=> [QMGR] Starting job ${JOBS_WORKDIR}/${_NEXT} -> ${_NEXT_HOST}"
                    "${SSH_BIN}" "${_NEXT_HOST}" '/bin/sh -s' \
                        < "${JOBS_WORKDIR}/${_NEXT}" &
                    # Here, $! is the PID of the local ssh client process
                    work_list_push "$!:${_NEXT_HOST}"
                fi
            fi
        fi
        # Forget finished jobs (and get their workers back), then pause
        # before polling again
        work_list_refresh
        sleep 0.2
    done

    if [ "${_NEXT}" = "fp_done" ]
    then
        echo_log "2" "<=== [QMGR] Done submitting jobs. Waiting for them to finish."
    else
        echo_log "2" "<=== [QMGR] Stopped. Waiting for jobs to finish."
    fi
    # Block until every background job started above has exited
    wait
    echo_log "2" "<=== [QMGR] Queue processed"

    # Set the 'sl_done' (sync done) flag to let the master process go
    job_queue_sl_done
}

########## Program start (main() !)

# Hand the command line over to the option parser
parse_opts "$@"

## Options' post-processing section

# Job name initialization
if [ -z "${OPT_JOBNAME}" ]
then
    # New run: generate a unique job name. This job name *must* remain
    # unique from one job to another. It is built from (at most) the first 16
    # characters of dst_dir/'s last component (restricted to a-zA-Z0-9),
    # the current epoch and our PID.
    FPART_JOBNAME="$(echo ${OPT_DSTDIR} | LC_ALL=C tr -dc '/a-zA-Z0-9' | sed -E -e 's|/*$||' -e 's|^.*/([^/]+)$|\1|' | head -c 16)-$(date '+%s')-$$"
elif [ -d "${OPT_TMPDIR}/queue/${OPT_JOBNAME}" ] && \
    [ -d "${OPT_TMPDIR}/work/${OPT_JOBNAME}" ]
then
    # Resume mode, and the job's queue and work directories both exist
    FPART_JOBNAME="${OPT_JOBNAME}"
else
    end_die "Could not find specified job's queue and work directories"
fi

# Queue manager configuration. Both directories live below the (local)
# temporary directory, even when using SSH.
JOBS_QUEUEDIR="${OPT_TMPDIR}/queue/${FPART_JOBNAME}"   # Jobs waiting to be run
JOBS_WORKDIR="${OPT_TMPDIR}/work/${FPART_JOBNAME}"     # Jobs that were dequeued

# Executables are looked up through the PATH using 'command -v', which is
# specified by POSIX (unlike which(1), which may be missing or may behave
# differently from one system to another). Each variable is left empty when
# the executable cannot be found.

# Paths to executables that must exist locally
FPART_BIN="$(command -v fpart)"
SSH_BIN="$(command -v ssh)"
MAIL_BIN="$(command -v mail)"

# Paths to executables that must exist both locally and remotely
SUDO_BIN="$(command -v sudo)"

# Paths to executables that must exist either locally or remotely (depending
# on if you use SSH or not). When using SSH, the following binaries must be
# present at those paths on each worker.
RSYNC_BIN="$(command -v rsync)"

# Do we need sudo ?
# SUDO is used as a command prefix: it holds sudo's path in sudo mode (-S)
# and is empty otherwise.
if [ -n "${OPT_SUDO}" ]
then
    SUDO="${SUDO_BIN}"
else
    SUDO=""
fi

# Fpart paths. Those ones must be shared amongst all nodes when using SSH
# (e.g. through an NFS share mounted on *every* single node, including the
# master 'job submitter').
# All of them live below the shared directory (OPT_FPSHDIR) and are specific
# to the current job (FPART_JOBNAME):
# - FPART_PARTSDIR: directory receiving fpart's partition files
# - FPART_PARTSTMPL: partition files' name template, passed to fpart (-o)
# - FPART_LOGDIR: directory receiving each sync job's output
# - FPART_LOGFILE: fpsync's own log file, also receiving fpart's output
FPART_PARTSDIR="${OPT_FPSHDIR}/parts/${FPART_JOBNAME}"
FPART_PARTSTMPL="${FPART_PARTSDIR}/part"
FPART_LOGDIR="${OPT_FPSHDIR}/log/${FPART_JOBNAME}"
FPART_LOGFILE="${FPART_LOGDIR}/fpart.log"

# Fpart hooks (black magic is here !)
# FPART_COMMAND is the text of a sync job: a '/bin/sh -c' invocation running
# rsync (prefixed with sudo if requested) from src_dir/ to dst_dir/, limited
# to the files listed in a partition file (--files-from), with stdout and
# stderr redirected to per-partition files within FPART_LOGDIR.
# FPART_POSTHOOK is the command passed to fpart (-W): it writes that text to
# a job file named after the partition number within JOBS_QUEUEDIR, then
# prints a message when verbosity is >= 2.
# Quoting: values known right now (SUDO, RSYNC_BIN, OPT_RSYNC, OPT_SRCDIR,
# OPT_DSTDIR, FPART_LOGDIR, JOBS_QUEUEDIR, OPT_VERBOSE) are expanded here,
# while the escaped \${FPART_PARTFILENAME} and \${FPART_PARTNUMBER} are kept
# unexpanded within the strings (presumably provided by fpart to the hook's
# environment - see fpart's documentation). The backslash-escaped double
# quotes are kept within the strings as well, for the next evaluation
# levels.
FPART_COMMAND="/bin/sh -c '${SUDO} ${RSYNC_BIN} ${OPT_RSYNC} \
        --files-from=\\\"\${FPART_PARTFILENAME}\\\" \
        \\\"${OPT_SRCDIR}/\\\" \
        \\\"${OPT_DSTDIR}/\\\"' \
        1>\"${FPART_LOGDIR}/\${FPART_PARTNUMBER}.stdout\" \
        2>\"${FPART_LOGDIR}/\${FPART_PARTNUMBER}.stderr\""
FPART_POSTHOOK="echo \"${FPART_COMMAND}\" > \
        \"${JOBS_QUEUEDIR}/\${FPART_PARTNUMBER}\" && \
        [ ${OPT_VERBOSE} -ge 2 ] && \
        echo \"=> [FPART] Partition \${FPART_PARTNUMBER} written\"" # [1]

# [1] Be careful to host the job queue on a filesystem that can handle
# fine-grained mtime timestamps (i.e. with a sub-second precision) if you want
# the queue to be processed in order when fpart generates several job files per
# second.
# On FreeBSD, vfs timestamps' precision can be tuned using the
# vfs.timestamp_precision sysctl. See vfs_timestamp(9).

## End of options' post-processing section, the real work starts here

SIGINT_COUNT=0      # How many ^C have been received so far

WORK_NUM=0          # How many sync processes are currently running
WORK_LIST=''        # Running works, as a list of PID[:WORKER] entries
WORK_FREEWORKERS='' # Workers available for a new job

# Give up right now if a binary we depend on is missing. fpart is always
# required; ssh, mail and sudo only matter when the related feature is used.
[ -x "${FPART_BIN}" ] || \
    end_die "Fpart is missing locally, check your configuration"
if [ -n "${OPT_WRKRS}" ] && [ ! -x "${SSH_BIN}" ]
then
    end_die "SSH is missing locally, check your configuration"
fi
if [ -n "${OPT_MAIL}" ] && [ ! -x "${MAIL_BIN}" ]
then
    end_die "Mail is missing locally, check your configuration"
fi
if [ -n "${OPT_SUDO}" ] && [ ! -x "${SUDO}" ]
then
    end_die "Sudo is missing locally, check your configuration"
fi

# Fpart shared directories: created for a new job, required to pre-exist
# when resuming one
if [ -n "${OPT_JOBNAME}" ]
then
    # Resume mode: both FPART_PARTSDIR and FPART_LOGDIR must be there already
    { [ -d "${FPART_PARTSDIR}" ] && [ -d "${FPART_LOGDIR}" ] ;} || \
        end_die "Could not find specified job's 'parts' and 'log' directories"
else
    # New job: create them (as well as missing parents)
    if ! mkdir -p "${FPART_PARTSDIR}" 2>/dev/null
    then
        end_die "Cannot create partitions' output directory: ${FPART_PARTSDIR}"
    fi
    if ! mkdir -p "${FPART_LOGDIR}" 2>/dev/null
    then
        end_die "Cannot create log directory: ${FPART_LOGDIR}"
    fi
fi

# Make sure the log file exists and can be touched
if ! touch "${FPART_LOGFILE}" 2>/dev/null
then
    end_die "Cannot create log file: ${FPART_LOGFILE}"
fi

# Job and work queues: created for a new job, checked then prepared when
# resuming one
if [ -n "${OPT_JOBNAME}" ]
then
    # A job can be resumed if:
    # - the last fpart pass has completed and the last fpsync pass has *not*
    #   (the 'fp_done' flag is *still* present within the queue)
    # - we can get the number of workers previously implied
    #   (the 'info' file is present)
    # - the work queue exists
    { [ -f "${JOBS_QUEUEDIR}/fp_done" ] && [ -f "${JOBS_QUEUEDIR}/info" ] ;} || \
        end_die "Specified job is not resumable ('fp_done' or 'info' flag missing)"
    [ -d "${JOBS_WORKDIR}" ] || \
        end_die "Specified job is not resumable (work queue missing)"

    # Job is resumable, try to reload job options and prepare queues
    job_queue_info_load
    # Get rid of the "sl_stop" and "sl_done" flags, if any
    rm -f "${JOBS_QUEUEDIR}/sl_stop" "${JOBS_QUEUEDIR}/sl_done" 2>/dev/null
    # Jobs that were running when the previous run stopped may be incomplete.
    # We consider the worst-case scenario: the OPT_RJOBS most-recently
    # dequeued jobs are moved back to the jobs queue so that they can be
    # executed again.
    for _file in \
        $(cd "${JOBS_WORKDIR}" && ls -t1 | head -n "${OPT_RJOBS}")
    do
        if ! mv "${JOBS_WORKDIR}/${_file}" "${JOBS_QUEUEDIR}" 2>/dev/null
        then
            end_die "Cannot resume specified job"
        fi
    done
else
    # New job: create the queue and work directories
    job_queue_init
fi

# src_dir/ must exist locally for first runs (fpart crawls it from here) as
# well as for local ones (rsync reads it from here)
if { [ -z "${OPT_JOBNAME}" ] || [ -z "${OPT_WRKRS}" ] ;} && \
    [ ! -d "${OPT_SRCDIR}" ]
then
    end_die "Source directory does not exist (or is not a directory): ${OPT_SRCDIR}"
fi

# When using SSH, validate src_dir/ and dst_dir/ remotely and check for rsync
# presence (this also allows checking SSH connectivity to each declared host)
# Without SSH workers, dst_dir/ is created locally if needed and rsync is
# looked for locally. Any failed check terminates the program.
if [ -n "${OPT_WRKRS}" ]
then
    echo_log "2" "=====> Validating requirements on SSH nodes..."

    # The first declared worker is the one creating (and finally removing)
    # the witness file
    _FIRST_HOST="$(echo ${OPT_WRKRS} | awk '{print $1}')"
    for _host in ${OPT_WRKRS}
    do
        # Check for sudo presence (it must be passwordless)
        # (the remote command runs the no-op ':' through sudo)
        if [ -n "${OPT_SUDO}" ]
        then
            "${SSH_BIN}" "${_host}" "${SUDO} /bin/sh -c ':' 2>/dev/null" || \
                end_die "Sudo executable not found or requires password on target ${_host}"
        fi

        # Blindly try to create dst_dir/ as well as a witness file on the first
        # node. Using a witness file will allow us to check for its
        # presence/visibility from other nodes, avoiding "split-brain"
        # situations where dst_dir/ exists but is not shared amongst all nodes
        # (typically a local mount point where the shared storage area *should*
        # be mounted but isn't, for any reason).
        # The witness file is named after the job (FPART_JOBNAME). The result
        # of this step is not checked here: the checks below detect a failure.
        [ "${_host}" = "${_FIRST_HOST}" ] && \
            "${SSH_BIN}" "${_host}" "/bin/sh -c 'mkdir -p \"${OPT_DSTDIR}\" && \
                ${SUDO} touch \"${OPT_DSTDIR}/${FPART_JOBNAME}\"' 2>/dev/null"

        # Check for src_dir/ and dst_dir/ (witness file)
        "${SSH_BIN}" "${_host}" "/bin/sh -c '[ -d \"${OPT_SRCDIR}\" ]'" || \
            end_die "Source directory does not exist on target ${_host} (or is not a directory): ${OPT_SRCDIR}"
        "${SSH_BIN}" "${_host}" "/bin/sh -c '[ -f \"${OPT_DSTDIR}/${FPART_JOBNAME}\" ]'" || \
            end_die "Destination directory (shared) is not available on target ${_host}: ${OPT_DSTDIR}"

        # Finally, check for rsync presence
        # (the path tested remotely is the one that was found locally)
        "${SSH_BIN}" "${_host}" "/bin/sh -c '[ -x \"${RSYNC_BIN}\" ]'" || \
            end_die "Rsync executable not found on target ${_host}"
        echo_log "2" "<=== ${_host}: OK"
    done

    # Remove witness file
    "${SSH_BIN}" "${_FIRST_HOST}" \
        "/bin/sh -c '${SUDO} rm -f \"${OPT_DSTDIR}/${FPART_JOBNAME}\"' 2>/dev/null"
    unset _FIRST_HOST
else
    # Local usage - create dst_dir/ and check for rsync presence
    if [ ! -d "${OPT_DSTDIR}" ]
    then
        mkdir -p "${OPT_DSTDIR}" 2>/dev/null || \
            end_die "Cannot create destination directory: ${OPT_DSTDIR}"
    fi
    [ ! -x "${RSYNC_BIN}" ] && \
        end_die "Rsync is missing locally, check your configuration"
fi

# Build the free workers' list (WORK_FREEWORKERS) from OPT_WRKRS
work_list_free_workers_init

# Pieces of text needed by the summary below
_RESUMED=""
if [ -n "${OPT_JOBNAME}" ]
then
    _RESUMED=' (resumed)'
fi
if [ -z "${OPT_WRKRS}" ]
then
    _WORKERS='local'
else
    # Tidy the list up: no leading blank, single spaces between workers
    _WORKERS="$(echo "${OPT_WRKRS}" | sed -E -e 's/^[[:space:]]+//' -e 's/[[:space:]]+/ /g')"
fi

# Let's rock ! Log (and print, depending on verbosity) a summary of the run
echo_log "2" "=====> [$$] Syncing ${OPT_SRCDIR} => ${OPT_DSTDIR}"
echo_log "1" "===> Job name: ${FPART_JOBNAME}${_RESUMED}"
echo_log "2" "===> Start time: $(date)"
echo_log "2" "===> Concurrent sync jobs: ${OPT_JOBS}"
echo_log "2" "===> Workers: ${_WORKERS}"
echo_log "2" "===> Shared dir: ${OPT_FPSHDIR}"
echo_log "2" "===> Temp dir: ${OPT_TMPDIR}"
# The following options are ignored when resuming, so do not show them then
if [ -z "${OPT_JOBNAME}" ]
then
    echo_log "2" "===> Max files per sync job: ${OPT_FPMAXPARTFILES}"
    echo_log "2" "===> Max bytes per sync job: ${OPT_FPMAXPARTSIZE}"
    echo_log "2" "===> Rsync options: \"${OPT_RSYNC}\""
fi
unset _RESUMED _WORKERS

# Record job information
# (written to the 'info' file of the job queue, needed to resume the job)
job_queue_info_dump

# Set SIGINT and SIGINFO traps and start job_queue_loop
# Signals are given by number: 2 for SIGINT and 29 for what the message below
# calls SIGINFO.
# NOTE(review): signal numbering is system-dependent - confirm that 29
# designates SIGINFO on the targeted systems
trap 'sigint_handler' 2
trap 'siginfo_handler' 29
echo_log "2" "===> Use ^C to abort, ^T (SIGINFO) to display status"
# The queue manager runs in the background: it consumes jobs while fpart
# (below) produces them
job_queue_loop&

# When not resuming a previous job, start fpart
if [ -z "${OPT_JOBNAME}" ]
then
    echo_log "1" "===> Analyzing filesystem..."
    # Start fpart from src_dir/ directory and produce jobs within
    # ${JOBS_QUEUEDIR}/
    # fpart's stdout and stderr are both displayed and appended to the log
    # file through tee(1).
    # ${SUDO} and ${OPT_FPART} are expanded unquoted: an empty SUDO vanishes
    # and OPT_FPART gives one argument per word.
    # NOTE(review): being unquoted, OPT_FPART's patterns (e.g. '.snapshot*')
    # are also subject to pathname expansion within src_dir/ - confirm this
    # is harmless
    cd "${OPT_SRCDIR}" && \
        ${SUDO} "${FPART_BIN}" \
        -f "${OPT_FPMAXPARTFILES}" \
        -s "${OPT_FPMAXPARTSIZE}" \
        -o "${FPART_PARTSTMPL}" ${OPT_FPART} -Z -L \
        -W "${FPART_POSTHOOK}" . 2>&1 | tee -a "${FPART_LOGFILE}"
fi

# Crawling is over: let job_queue_loop know about it
job_queue_fp_done

# Wait for job_queue_loop to terminate, which it signals through the
# 'sl_done' flag. An active wait is used to allow signal processing (^T).
echo_log "1" "===> Waiting for sync jobs to complete..."
until [ -f "${JOBS_QUEUEDIR}/sl_done" ]
do
    sleep 0.2
done

# Display final status
if [ ${OPT_VERBOSE} -ge 1 ]
then
    siginfo_handler
fi

# Examine results: any non-empty *.stderr file found within the log directory
# is reported as an error
RET=$(find "${FPART_LOGDIR}/" -name "*.stderr" ! -size 0)
if [ -z "${RET}" ]
then
    MSG='Fpsync completed without error.'
else
    MSG="Fpsync completed with errors, see logs:
${RET}"
fi

# Send an e-mail if requested
if [ -n "${OPT_MAIL}" ]
then
    echo "${MSG}" | ${MAIL_BIN} -s "Fpsync job ${FPART_JOBNAME}" "${OPT_MAIL}"
fi
echo_log "1" "<=== ${MSG}"
echo_log "2" "<=== End time: $(date)"

# Exit status: 0 when no error has been found, 1 otherwise
if [ -z "${RET}" ]
then
    exit 0
fi
exit 1
