Source code for wbia.web.job_engine

# -*- coding: utf-8 -*-
"""
Accepts and handles requests for tasks.

Each of the following runs in its own Thread/Process.

BASICALLY DO A CLIENT/SERVER TO SPAWN PROCESSES
AND THEN A PUBLISH SUBSCRIBE TO RETURN DATA

Accepter:
    Receives tasks and requests
    Delegates tasks and responds to requests
    Tasks are delegated to an engine

Engine:
    The engine accepts requests.
    The engine immediately responds WHERE it will be ready.
    The engine sends a message to the collector saying that something will be ready.
    The engine then executes a task.
    The engine is given direct access to the data.

Collector:
    The collector accepts requests
    The collector can respond:
    * <ResultContent>
    * Results are ready.
    * Results are not ready.
    * Unknown jobid.
    * Error computing results.
    * Progress percent.

References:
    Simple task farm, with routed replies in pyzmq
    http://stackoverflow.com/questions/7809200/implementing-task-farm-messaging-pattern-with-zeromq
    https://gist.github.com/minrk/1358832

Notes:
    We are essentially going to be spawning two processes.
    We can test these simultaneously using

    python -m wbia.web.job_engine job_engine_tester

    We can test these separately by first starting the background server
    python -m wbia.web.job_engine job_engine_tester --bg

    Alternative:
    python -m wbia.web.job_engine job_engine_tester --bg --no-engine
    python -m wbia.web.job_engine job_engine_tester --bg --only-engine --fg-engine

    And then running the foreground process
    python -m wbia.web.job_engine job_engine_tester --fg
"""
# if False:
#    import os
#    os.environ['UTOOL_NOCNN'] = 'True'
# import logging
import utool as ut
import time
import zmq
import uuid  # NOQA
import numpy as np
import shelve
import random
from datetime import datetime, timedelta
import pytz
import flask
from os.path import join, exists, abspath, splitext, basename
from functools import partial
from wbia.control import controller_inject
from wbia.utils import call_houston
import multiprocessing


print, rrr, profile = ut.inject2(__name__)  # NOQA
# logger = logging.getLogger('wbia')


CLASS_INJECT_KEY, register_ibs_method = controller_inject.make_ibs_register_decorator(
    __name__
)
register_api = controller_inject.get_wbia_flask_api(__name__)

ctx = zmq.Context.instance()

# FIXME: needs to use correct number of ports
URL = 'tcp://127.0.0.1'
NUM_DEFAULT_ENGINES = ut.get_argval('--engine-lane-workers', int, 2)
NUM_SLOW_ENGINES = ut.get_argval('--engine-slow-lane-workers', int, NUM_DEFAULT_ENGINES)
NUM_FAST_ENGINES = ut.get_argval('--engine-fast-lane-workers', int, NUM_DEFAULT_ENGINES)
NUM_ENGINES = {
    'slow': NUM_SLOW_ENGINES,
    'fast': NUM_FAST_ENGINES,
}
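# The lane worker counts above come from the command line; a hypothetical invocation
# (shown only for illustration) that sizes the two lanes independently would be:
#   python -m wbia.web.job_engine job_engine_tester --bg \
#       --engine-slow-lane-workers 2 --engine-fast-lane-workers 4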
# VERBOSE_JOBS = (
#     ut.get_argflag('--bg') or ut.get_argflag('--fg') or ut.get_argflag('--verbose-jobs')
# )
VERBOSE_JOBS = False


GLOBAL_SHELVE_LOCK = multiprocessing.Lock()


TIMESTAMP_FMTSTR = '%Y-%m-%d %H:%M:%S %Z'
TIMESTAMP_TIMEZONE = 'US/Pacific'
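# With the format and timezone above, _timestamp() below produces strings like
# '2021-03-01 12:30:45 PST' (illustrative value); convert_to_date() strips the
# trailing timezone token before parsing.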


JOB_STATUS_CACHE = {}


def update_proctitle(procname, dbname=None):
    try:
        import setproctitle

        print('CHANGING PROCESS TITLE')
        old_title = setproctitle.getproctitle()
        print('old_title = %r' % (old_title,))
        hostname = ut.get_computer_name()
        new_title = 'WBIA_%s_%s_%s' % (dbname, hostname, procname)
        print('new_title = %r' % (new_title,))
        setproctitle.setproctitle(new_title)
    except ImportError:
        print('pip install setproctitle')

def _get_engine_job_paths(ibs):
    shelve_path = ibs.get_shelves_path()
    ut.ensuredir(shelve_path)
    record_filepath_list = list(ut.iglob(join(shelve_path, '*.pkl')))
    return record_filepath_list


def _get_engine_lock_paths(ibs):
    shelve_path = ibs.get_shelves_path()
    ut.ensuredir(shelve_path)
    lock_filepath_list = list(ut.iglob(join(shelve_path, '*.lock')))
    return lock_filepath_list

@register_ibs_method
def retry_job(ibs, jobid):
    from os.path import exists

    shelve_path = ibs.get_shelves_path()
    job_record_filename = '%s.pkl' % (jobid,)
    job_record_filepath = join(shelve_path, job_record_filename)
    assert exists(job_record_filepath)
    job_record = ut.load_cPkl(job_record_filepath)
    job_action = job_record['request']['action']
    job_args = job_record['request']['args']
    job_kwargs = job_record['request']['kwargs']
    job_func = getattr(ibs, job_action, None)
    job_result = None  # Default when the action cannot be resolved on the controller
    if job_func is not None:
        job_result = job_func(*job_args, **job_kwargs)
    return job_action, job_func, job_args, job_kwargs, job_result

@register_ibs_method
def initialize_job_manager(ibs):
    """
    Starts a background zmq job engine. Initializes a zmq object in this thread
    that can talk to the background processes.

    Run from the webserver

    CommandLine:
        python -m wbia.web.job_engine --exec-initialize_job_manager:0

    Example:
        >>> # DISABLE_DOCTEST
        >>> # xdoctest: +REQUIRES(--job-engine-tests)
        >>> from wbia.web.job_engine import *  # NOQA
        >>> import wbia
        >>> ibs = wbia.opendb(defaultdb='testdb1')
        >>> from wbia.web import apis_engine
        >>> from wbia.web import job_engine
        >>> ibs.load_plugin_module(job_engine)
        >>> ibs.load_plugin_module(apis_engine)
        >>> ibs.initialize_job_manager()
        >>> print('Initialization success. Now closing')
        >>> ibs.close_job_manager()
        >>> print('Closing success.')
    """
    ibs.job_manager = ut.DynStruct()

    use_static_ports = False
    if ut.get_argflag('--web-deterministic-ports'):
        use_static_ports = True
    if ut.get_argflag('--fg'):
        ibs.job_manager.reciever = JobBackend(use_static_ports=True)
    else:
        ibs.job_manager.reciever = JobBackend(use_static_ports=use_static_ports)
    ibs.job_manager.reciever.initialize_background_processes(
        dbdir=ibs.get_dbdir(), containerized=ibs.containerized
    )

    # Delete any leftover locks from before
    lock_filepath_list = _get_engine_lock_paths(ibs)
    print('Deleting %d leftover engine locks' % (len(lock_filepath_list),))
    for lock_filepath in lock_filepath_list:
        ut.delete(lock_filepath)

    ibs.job_manager.jobiface = JobInterface(
        0, ibs.job_manager.reciever.port_dict, ibs=ibs
    )
    ibs.job_manager.jobiface.initialize_client_thread()

    # Wait until the collector becomes live
    while 0 and True:
        result = ibs.get_job_status(-1)
        print('result = %r' % (result,))
        if result['status'] == 'ok':
            break

    ibs.job_manager.jobiface.queue_interrupted_jobs()

# import wbia
# # dbdir = '/media/raid/work/testdb1'
# from wbia.web import app
# web_port = ibs.get_web_port_via_scan()
# if web_port is None:
#     raise ValueError('IA web server is not running on any expected port')
# proc = ut.spawn_background_process(app.start_from_wbia, ibs, port=web_port)
@register_ibs_method
def close_job_manager(ibs):
    # if hasattr(ibs, 'job_manager') and ibs.job_manager is not None:
    #     pass
    del ibs.job_manager.reciever
    del ibs.job_manager.jobiface
    ibs.job_manager = None

@register_ibs_method
@register_api('/api/engine/job/', methods=['GET', 'POST'], __api_plural_check__=False)
def get_job_id_list(ibs):
    """
    Web call that returns the list of job ids

    CommandLine:
        # Run Everything together
        python -m wbia.web.job_engine --exec-get_job_status

        # Start job queue in its own process
        python -m wbia.web.job_engine job_engine_tester --bg
        # Start web server in its own process
        ./main.py --web --fg
        pass
        # Run foreground process
        python -m wbia.web.job_engine --exec-get_job_status:0 --fg

    Example:
        >>> # xdoctest: +REQUIRES(--web-tests)
        >>> # xdoctest: +REQUIRES(--job-engine-tests)
        >>> from wbia.web.job_engine import *  # NOQA
        >>> import wbia
        >>> with wbia.opendb_bg_web('testdb1', managed=True) as web_ibs:  # , domain='http://52.33.105.88')
        ...     # Test get status of a job id that does not exist
        ...     response = web_ibs.send_wbia_request('/api/engine/job/', jobid='badjob')
    """
    status = ibs.job_manager.jobiface.get_job_id_list()
    jobid_list = status['jobid_list']
    return jobid_list

@register_ibs_method
@register_api(
    '/api/engine/process/status/', methods=['GET', 'POST'], __api_plural_check__=False
)
def get_process_alive_status(ibs):
    status_dict = ibs.job_manager.reciever.get_process_alive_status()
    print('status_dict = %r' % (status_dict,))
    return status_dict

@register_ibs_method
@register_api(
    '/api/engine/job/status/', methods=['GET', 'POST'], __api_plural_check__=False
)
def get_job_status(ibs, jobid=None):
    """
    Web call that returns the status of a job

    Returns one of:
        received   - job has been received, but not ingested yet
        accepted   - job has been accepted (validated)
        queued     - job has been transferred to the engine queue
        working    - job is being worked on by the engine
        publishing - job is done on the engine, pushing results to collector
        completed | exception - job is complete or has an error

    CommandLine:
        # Run Everything together
        python -m wbia.web.job_engine --exec-get_job_status

        # Start job queue in its own process
        python -m wbia.web.job_engine job_engine_tester --bg
        # Start web server in its own process
        ./main.py --web --fg
        pass
        # Run foreground process
        python -m wbia.web.job_engine --exec-get_job_status:0 --fg

    Example:
        >>> # xdoctest: +REQUIRES(--web-tests)
        >>> # xdoctest: +REQUIRES(--job-engine-tests)
        >>> from wbia.web.job_engine import *  # NOQA
        >>> import wbia
        >>> with wbia.opendb_bg_web('testdb1', managed=True) as web_ibs:  # , domain='http://52.33.105.88')
        ...     # Test get status of a job id that does not exist
        ...     response = web_ibs.send_wbia_request('/api/engine/job/status/', jobid='badjob')
    """
    if jobid is None:
        status = ibs.job_manager.jobiface.get_job_status_dict()
    else:
        status = ibs.job_manager.jobiface.get_job_status(jobid)
    return status

# @register_ibs_method
# @register_api('/api/engine/job/terminate/', methods=['GET', 'POST'])
# def send_job_terminate(ibs, jobid):
#     """
#     Web call that terminates a job
#     """
#     success = ibs.job_manager.jobiface.terminate_job(jobid)
#     return success
@register_ibs_method
@register_api(
    '/api/engine/job/metadata/', methods=['GET', 'POST'], __api_plural_check__=False
)
def get_job_metadata(ibs, jobid):
    """
    Web call that returns the metadata of a job

    CommandLine:
        # Run Everything together
        python -m wbia.web.job_engine --exec-get_job_metadata

        # Start job queue in its own process
        python -m wbia.web.job_engine job_engine_tester --bg
        # Start web server in its own process
        ./main.py --web --fg
        pass
        # Run foreground process
        python -m wbia.web.job_engine --exec-get_job_metadata:0 --fg

    Example:
        >>> # xdoctest: +REQUIRES(--web-tests)
        >>> # xdoctest: +REQUIRES(--slow)
        >>> # xdoctest: +REQUIRES(--job-engine-tests)
        >>> from wbia.web.job_engine import *  # NOQA
        >>> import wbia
        >>> with wbia.opendb_bg_web('testdb1', managed=True) as web_ibs:  # , domain='http://52.33.105.88')
        ...     # Test get metadata of a job id that does not exist
        ...     response = web_ibs.send_wbia_request('/api/engine/job/metadata/', jobid='badjob')
    """
    status = ibs.job_manager.jobiface.get_job_metadata(jobid)
    return status

@register_ibs_method
@register_api('/api/engine/job/result/', methods=['GET', 'POST'])
def get_job_result(ibs, jobid):
    """
    Web call that returns the result of a job
    """
    result = ibs.job_manager.jobiface.get_job_result(jobid)
    return result

@register_ibs_method
@register_api('/api/engine/job/result/wait/', methods=['GET', 'POST'])
def wait_for_job_result(ibs, jobid, timeout=10, freq=0.1):
    ibs.job_manager.jobiface.wait_for_job_result(jobid, timeout=timeout, freq=freq)
    result = ibs.job_manager.jobiface.get_unpacked_result(jobid)
    return result

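# Hedged usage sketch for the job endpoints registered above (assumes a wbia web
# server reachable at an illustrative http://127.0.0.1:5000 and the standard
# `requests` package; `jobid` would come from a previously queued engine call):
#
#   import requests
#   base = 'http://127.0.0.1:5000'
#   requests.get(base + '/api/engine/job/').json()                                  # all job ids
#   requests.get(base + '/api/engine/job/status/', params={'jobid': jobid}).json()  # one job's status
#   requests.get(base + '/api/engine/job/result/', params={'jobid': jobid}).json()  # its result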
def _get_random_open_port():
    port = random.randint(1024, 49151)
    while not ut.is_local_port_open(port):
        port = random.randint(1024, 49151)
    assert ut.is_local_port_open(port)
    return port

[docs]def job_engine_tester(): """ CommandLine: python -m wbia.web.job_engine --exec-job_engine_tester python -b -m wbia.web.job_engine --exec-job_engine_tester python -m wbia.web.job_engine job_engine_tester python -m wbia.web.job_engine job_engine_tester --bg python -m wbia.web.job_engine job_engine_tester --fg Example: >>> # SCRIPT >>> from wbia.web.job_engine import * # NOQA >>> job_engine_tester() """ _init_signals() # now start a few clients, and fire off some requests client_id = np.random.randint(1000) reciever = JobBackend(use_static_ports=True) jobiface = JobInterface(client_id, reciever.port_dict) from wbia.init import sysres if ut.get_argflag('--bg'): dbdir = sysres.get_args_dbdir( defaultdb='cache', allow_newdir=False, db=None, dbdir=None ) reciever.initialize_background_processes(dbdir) print('[testzmq] parent process is looping forever') while True: time.sleep(1) elif ut.get_argflag('--fg'): jobiface.initialize_client_thread() else: dbdir = sysres.get_args_dbdir( defaultdb='cache', allow_newdir=False, db=None, dbdir=None ) reciever.initialize_background_processes(dbdir) jobiface.initialize_client_thread() # Foreground test script print('... waiting for jobs') if ut.get_argflag('--cmd'): ut.embed() # jobiface.queue_job() else: print('[test] ... emit test1') callback_url = None callback_method = None args = (1,) jobid1 = jobiface.queue_job( action='helloworld', callback_url=callback_url, callback_method=callback_method, args=args, ) jobiface.wait_for_job_result(jobid1) jobid_list = [] args = ([1], [3, 4, 5]) kwargs = dict(cfgdict={'K': 1}) identify_jobid = jobiface.queue_job( action='query_chips_simple_dict', callback_url=callback_url, callback_method=callback_method, args=args, kwargs=kwargs, ) for jobid in jobid_list: jobiface.wait_for_job_result(jobid) jobiface.wait_for_job_result(identify_jobid) print('FINISHED TEST SCRIPT')
def spawn_background_process(func, *args, **kwargs):
    import utool as ut

    func_name = ut.get_funcname(func)
    name = 'job-engine.Progress-' + func_name
    proc_obj = multiprocessing.Process(target=func, name=name, args=args, kwargs=kwargs)
    proc_obj.daemon = True
    proc_obj.start()
    return proc_obj

class JobBackend(object):
    def __init__(self, **kwargs):
        # self.num_engines = 3
        self.engine_queue_proc = None
        self.engine_lanes = ['fast', 'slow']
        self.engine_lanes = [lane.lower() for lane in self.engine_lanes]
        assert 'slow' in self.engine_lanes
        self.num_engines = {lane: NUM_ENGINES[lane] for lane in self.engine_lanes}
        self.engine_procs = None
        self.collect_queue_proc = None
        self.collect_proc = None
        # --
        only_engine = ut.get_argflag('--only-engine')
        self.spawn_collector = not only_engine
        self.spawn_engine = not ut.get_argflag('--no-engine')
        self.fg_engine = ut.get_argflag('--fg-engine')
        self.spawn_queue = not only_engine
        # Find ports
        self.port_dict = None
        self._initialize_job_ports(**kwargs)
        print('JobBackend ports:')
        ut.print_dict(self.port_dict)

    def __del__(self):
        if VERBOSE_JOBS:
            print('Cleaning up job backend')
        if self.engine_procs is not None:
            for lane in self.engine_procs:
                for engine in self.engine_procs[lane]:
                    try:
                        engine.terminate()
                    except Exception:
                        pass
        if self.engine_queue_proc is not None:
            try:
                self.engine_queue_proc.terminate()
            except Exception:
                pass
        if self.collect_proc is not None:
            try:
                self.collect_proc.terminate()
            except Exception:
                pass
        if self.collect_queue_proc is not None:
            try:
                self.collect_queue_proc.terminate()
            except Exception:
                pass
        if VERBOSE_JOBS:
            print('Killed external procs')

    def _initialize_job_ports(self, use_static_ports=False, static_root=51381):
        # _portgen = functools.partial(next, itertools.count(51381))
        key_list = [
            'collect_pull_url',
            'collect_push_url',
            'engine_pull_url',
            # 'engine_push_url',
            # 'collect_pushpull_url',
        ]
        for lane in self.engine_lanes:
            key_list.append('engine_%s_push_url' % (lane,))
        # Get ports
        if use_static_ports:
            port_list = range(static_root, static_root + len(key_list))
        else:
            port_list = []
            while len(port_list) < len(key_list):
                port = _get_random_open_port()
                if port not in port_list:
                    port_list.append(port)
            port_list = sorted(port_list)
        # Assign ports
        assert len(key_list) == len(port_list)
        self.port_dict = {
            key: '%s:%d' % (URL, port)
            for key, port in list(zip(key_list, port_list))
        }

    def initialize_background_processes(
        self, dbdir=None, containerized=False, thread=False
    ):
        # if VERBOSE_JOBS:
        print('Initialize Background Processes')

        def _spawner(func, *args, **kwargs):
            # if thread:
            #     _spawner_func_ = ut.spawn_background_daemon_thread
            # else:
            #     _spawner_func_ = ut.spawn_background_process
            _spawner_func_ = spawn_background_process
            proc = _spawner_func_(func, *args, **kwargs)
            assert proc.is_alive(), 'proc (%s) died too soon' % (ut.get_funcname(func))
            return proc

        if self.spawn_queue:
            self.engine_queue_proc = _spawner(
                engine_queue_loop, self.port_dict, self.engine_lanes
            )
            self.collect_queue_proc = _spawner(collect_queue_loop, self.port_dict)
        if self.spawn_collector:
            self.collect_proc = _spawner(
                collector_loop, self.port_dict, dbdir, containerized
            )
        if self.spawn_engine:
            if self.fg_engine:
                print('ENGINE IS IN DEBUG FOREGROUND MODE')
                # Spawn engine in foreground process
                assert self.num_engines == 1, 'fg engine only works with one engine'
                engine_loop(0, self.port_dict, dbdir)
                assert False, 'should never see this'
            else:
                # Normal case
                if self.engine_procs is None:
                    self.engine_procs = {}
                for lane in self.engine_lanes:
                    if lane not in self.engine_procs:
                        self.engine_procs[lane] = []
                    for i in range(self.num_engines[lane]):
                        proc = _spawner(
                            engine_loop, i, self.port_dict, dbdir, containerized, lane
                        )
                        self.engine_procs[lane].append(proc)
        # Check if online
        # wait for processes to spin up
        if self.spawn_queue:
            assert self.engine_queue_proc.is_alive(), 'engine died too soon'
            assert self.collect_queue_proc.is_alive(), 'collector queue died too soon'
        if self.spawn_collector:
            assert self.collect_proc.is_alive(), 'collector died too soon'
        if self.spawn_engine:
            for lane in self.engine_procs:
                for engine in self.engine_procs[lane]:
                    assert engine.is_alive(), 'engine died too soon'

    def get_process_alive_status(self):
        status_dict = {}
        if self.spawn_queue:
            status_dict['engine_queue'] = self.engine_queue_proc.is_alive()
            status_dict['collect_queue'] = self.collect_queue_proc.is_alive()
        if self.spawn_collector:
            status_dict['collector'] = self.collect_proc.is_alive()
        if self.spawn_engine:
            for lane in self.engine_procs:
                for id_, engine in enumerate(self.engine_procs[lane]):
                    engine_str = 'engine.%s.%s' % (lane, id_)
                    status_dict[engine_str] = engine.is_alive()
        return status_dict

def get_shelve_lock_filepath(shelve_filepath):
    shelve_lock_filepath = '%s.lock' % (shelve_filepath,)
    return shelve_lock_filepath

def touch_shelve_lock_file(shelve_filepath):
    shelve_lock_filepath = get_shelve_lock_filepath(shelve_filepath)
    assert not exists(shelve_lock_filepath)
    ut.touch(shelve_lock_filepath, verbose=False)
    assert exists(shelve_lock_filepath)

def delete_shelve_lock_file(shelve_filepath):
    shelve_lock_filepath = get_shelve_lock_filepath(shelve_filepath)
    assert exists(shelve_lock_filepath)
    ut.delete(shelve_lock_filepath, verbose=False)
    assert not exists(shelve_lock_filepath)

def wait_for_shelve_lock_file(shelve_filepath, timeout=600):
    shelve_lock_filepath = get_shelve_lock_filepath(shelve_filepath)
    start_time = time.time()
    while exists(shelve_lock_filepath):
        current_time = time.time()
        elapsed = current_time - start_time
        if elapsed >= timeout:
            return False
        time.sleep(1)
        if int(elapsed) % 5 == 0:
            print('Waiting for %0.02f seconds for lock so far' % (elapsed,))
    return True

def get_shelve_value(shelve_filepath, key):
    if shelve_filepath in [None, 'None', 'None.lock']:
        return None
    wait_for_shelve_lock_file(shelve_filepath)
    with GLOBAL_SHELVE_LOCK:
        wait_for_shelve_lock_file(shelve_filepath)
        touch_shelve_lock_file(shelve_filepath)
        value = None
        try:
            with shelve.open(shelve_filepath, 'r') as shelf:
                value = shelf.get(key)
        except Exception:
            pass
        delete_shelve_lock_file(shelve_filepath)
    return value

def set_shelve_value(shelve_filepath, key, value):
    if shelve_filepath in [None, 'None', 'None.lock']:
        return False
    wait_for_shelve_lock_file(shelve_filepath)
    with GLOBAL_SHELVE_LOCK:
        wait_for_shelve_lock_file(shelve_filepath)
        touch_shelve_lock_file(shelve_filepath)
        flag = False
        try:
            with shelve.open(shelve_filepath) as shelf:
                shelf[key] = value
                flag = True
        except Exception:
            pass
        delete_shelve_lock_file(shelve_filepath)
    return flag

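# Hedged round-trip sketch for the two helpers above (the path is illustrative):
# every access goes through a *.lock sentinel file plus GLOBAL_SHELVE_LOCK, so
# readers and writers in different processes do not corrupt the shelf.
#
#   shelve_filepath = '/tmp/example-jobid.input.shelve'      # hypothetical path
#   set_shelve_value(shelve_filepath, 'metadata', {'jobcounter': 1})
#   metadata = get_shelve_value(shelve_filepath, 'metadata')  # -> {'jobcounter': 1}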
def get_shelve_filepaths(ibs, jobid):
    shelve_path = ibs.get_shelves_path()
    shelve_input_filepath = abspath(join(shelve_path, '%s.input.shelve' % (jobid,)))
    shelve_output_filepath = abspath(join(shelve_path, '%s.output.shelve' % (jobid,)))
    return shelve_input_filepath, shelve_output_filepath

[docs]def initialize_process_record( record_filepath, shelve_input_filepath, shelve_output_filepath, shelve_path, shelve_archive_path, jobiface_id, ): MAX_ATTEMPTS = 20 ARCHIVE_DAYS = 3 timezone = pytz.timezone(TIMESTAMP_TIMEZONE) now = datetime.now(timezone) now = now.replace(microsecond=0) now = now.replace(second=0) now = now.replace(minute=0) now = now.replace(hour=0) archive_delta = timedelta(days=ARCHIVE_DAYS) archive_date = now - archive_delta archive_timestamp = archive_date.strftime(TIMESTAMP_FMTSTR) jobid = splitext(basename(record_filepath))[0] jobcounter = None # Load the engine record record = ut.load_cPkl(record_filepath, verbose=False) # Load the record info engine_request = record.get('request', None) attempts = record.get('attempts', 0) completed = record.get('completed', False) # Check status suppressed = attempts >= MAX_ATTEMPTS corrupted = engine_request is None # Load metadata metadata = get_shelve_value(shelve_input_filepath, 'metadata') if metadata is None: print('Missing metadata...corrupted') corrupted = True archived = False if not corrupted: jobcounter = metadata.get('jobcounter', None) times = metadata.get('times', {}) if jobcounter is None: print('Missing jobcounter...corrupted') corrupted = True job_age = None if not corrupted and completed: completed_timestamp = times.get('completed', None) if completed_timestamp is not None: try: archive_elapsed = calculate_timedelta( completed_timestamp, archive_timestamp ) job_age = archive_elapsed[-1] archived = job_age > 0 except Exception: args = ( completed_timestamp, archive_timestamp, ) print( '[job_engine] Could not determine archive status!\n\tCompleted: %r\n\tArchive: %r' % args ) if archived: with ut.Indenter('[client %d] ' % (jobiface_id)): color = 'brightmagenta' print_ = partial(ut.colorprint, color=color) print_('ARCHIVING JOB (AGE: %d SECONDS)' % (job_age,)) job_scr_filepath_list = list( ut.iglob(join(shelve_path, '%s*' % (jobid,))) ) for job_scr_filepath in job_scr_filepath_list: job_dst_filepath = job_scr_filepath.replace( shelve_path, shelve_archive_path ) ut.copy( job_scr_filepath, job_dst_filepath, overwrite=True ) # ut.copy allows for overwrite, ut.move does not ut.delete(job_scr_filepath) if archived: # We have archived the job, don't bother registering it engine_request = None else: if completed or suppressed or corrupted: # Register the job, pass the jobcounter and jobid only engine_request = None else: # We have a pending job, restart with the original request with ut.Indenter('[client %d] ' % (jobiface_id)): color = 'brightblue' if attempts == 0 else 'brightred' print_ = partial(ut.colorprint, color=color) print_( 'RESTARTING FAILED JOB FROM RESTART (ATTEMPT %d)' % (attempts + 1,) ) print_(ut.repr3(record_filepath)) # print_(ut.repr3(record)) times = metadata.get('times', {}) received = times['received'] engine_request['restart_jobid'] = jobid engine_request['restart_jobcounter'] = jobcounter engine_request['restart_received'] = received record['attempts'] = attempts + 1 ut.save_cPkl(record_filepath, record, verbose=False) values = jobcounter, jobid, engine_request, archived, completed, suppressed, corrupted return values
class JobInterface(object):
    def __init__(jobiface, id_, port_dict, ibs=None):
        jobiface.id_ = id_
        jobiface.ibs = ibs
        jobiface.verbose = 2 if VERBOSE_JOBS else 1
        jobiface.port_dict = port_dict
        print('JobInterface ports:')
        ut.print_dict(jobiface.port_dict)

    def __del__(jobiface):
        if VERBOSE_JOBS:
            print('Cleaning up job frontend')
        if jobiface.engine_recieve_socket is not None:
            jobiface.engine_recieve_socket.disconnect(
                jobiface.port_dict['engine_pull_url']
            )
            jobiface.engine_recieve_socket.close()
        if jobiface.collect_recieve_socket is not None:
            jobiface.collect_recieve_socket.disconnect(
                jobiface.port_dict['collect_pull_url']
            )
            jobiface.collect_recieve_socket.close()

    # def init(jobiface):
    #     # Starts several new processes
    #     jobiface.initialize_background_processes()
    #     # Does not create a new process, but connects sockets on this process
    #     jobiface.initialize_client_thread()

    def initialize_client_thread(jobiface):
        """
        Creates a ZMQ object in this thread. This talks to background processes.
        """
        if jobiface.verbose:
            print('Initializing JobInterface')
        jobiface.engine_recieve_socket = ctx.socket(zmq.DEALER)  # CHECK2 - REQ
        jobiface.engine_recieve_socket.setsockopt_string(
            zmq.IDENTITY, 'client%s.engine.DEALER' % (jobiface.id_,)
        )
        jobiface.engine_recieve_socket.connect(jobiface.port_dict['engine_pull_url'])
        if jobiface.verbose:
            print(
                'connect engine_pull_url = %r'
                % (jobiface.port_dict['engine_pull_url'],)
            )
        jobiface.collect_recieve_socket = ctx.socket(zmq.DEALER)  # CHECK2 - REQ
        jobiface.collect_recieve_socket.setsockopt_string(
            zmq.IDENTITY, 'client%s.collect.DEALER' % (jobiface.id_,)
        )
        jobiface.collect_recieve_socket.connect(jobiface.port_dict['collect_pull_url'])
        if jobiface.verbose:
            print(
                'connect collect_pull_url = %r'
                % (jobiface.port_dict['collect_pull_url'],)
            )

[docs] def queue_interrupted_jobs(jobiface): import tqdm ibs = jobiface.ibs if ibs is not None: shelve_path = ibs.get_shelves_path() shelve_path = shelve_path.rstrip('/') shelve_archive_path = '%s_ARCHIVE' % (shelve_path,) ut.ensuredir(shelve_archive_path) record_filepath_list = _get_engine_job_paths(ibs) num_records = len(record_filepath_list) print('Reloading %d engine jobs...' % (num_records,)) shelve_input_filepath_list = [] shelve_output_filepath_list = [] for record_filepath in record_filepath_list: jobid = splitext(basename(record_filepath))[0] shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths( ibs, jobid ) shelve_input_filepath_list.append(shelve_input_filepath) shelve_output_filepath_list.append(shelve_output_filepath) arg_iter = list( zip( record_filepath_list, shelve_input_filepath_list, shelve_output_filepath_list, [shelve_path] * num_records, [shelve_archive_path] * num_records, [jobiface.id_] * num_records, ) ) if len(arg_iter) > 0: values_list = ut.util_parallel.generate2( initialize_process_record, arg_iter ) values_list = list(values_list) else: values_list = [] print('Processed %d records' % (len(values_list),)) restart_jobcounter_list = [] restart_jobid_list = [] restart_request_list = [] global_jobcounter = 0 num_registered, num_restarted = 0, 0 num_completed, num_archived, num_suppressed, num_corrupted = 0, 0, 0, 0 for values in tqdm.tqdm(values_list): ( jobcounter, jobid, engine_request, archived, completed, suppressed, corrupted, ) = values if archived: assert engine_request is None num_archived += 1 continue if jobcounter is not None: global_jobcounter = max(global_jobcounter, jobcounter) if engine_request is None: assert not archived if completed: status = 'completed' num_completed += 1 elif suppressed: status = 'suppressed' num_suppressed += 1 else: status = 'corrupted' num_corrupted += 1 reply_notify = { 'jobid': jobid, 'status': status, 'action': 'register', } print('Sending register: %r' % (reply_notify,)) jobiface.collect_recieve_socket.send_json(reply_notify) reply = jobiface.collect_recieve_socket.recv_json() jobid_ = reply['jobid'] assert jobid_ == jobid else: num_restarted += 1 restart_jobcounter_list.append(jobcounter) restart_jobid_list.append(jobid) restart_request_list.append(engine_request) num_registered += 1 assert num_restarted == len(restart_jobcounter_list) print('Registered %d jobs...' % (num_registered,)) print('\t %d completed jobs' % (num_completed,)) print('\t %d restarted jobs' % (num_restarted,)) print('\t %d suppressed jobs' % (num_suppressed,)) print('\t %d corrupted jobs' % (num_corrupted,)) print('Archived %d jobs...' % (num_archived,)) # Update the jobcounter to be up to date update_notify = { '__set_jobcounter__': global_jobcounter, } print('Updating completed job counter: %r' % (update_notify,)) jobiface.engine_recieve_socket.send_json(update_notify) reply = jobiface.engine_recieve_socket.recv_json() jobcounter_ = reply['jobcounter'] assert jobcounter_ == global_jobcounter print('Re-sending %d engine jobs...' % (len(restart_jobcounter_list),)) index_list = np.argsort(restart_jobcounter_list) zipped = list( zip(restart_jobcounter_list, restart_jobid_list, restart_request_list) ) zipped = ut.take(zipped, index_list) for jobcounter, jobid, engine_request in tqdm.tqdm(zipped): jobiface.engine_recieve_socket.send_json(engine_request) reply = jobiface.engine_recieve_socket.recv_json() jobcounter_ = reply['jobcounter'] jobid_ = reply['jobid'] assert jobcounter_ == jobcounter assert jobid_ == jobid
    def queue_job(
        jobiface,
        action,
        callback_url=None,
        callback_method=None,
        callback_detailed=False,
        lane='slow',
        jobid=None,
        args=None,
        kwargs=None,
    ):
        r"""
        IBEIS:
            This is just a function that lives in the main thread and ships off
            a job.

        FIXME: I do not like having callback_url and callback_method specified
            like this with args and kwargs. If these must be there then they
            should be specified first, or THE PREFERRED OPTION IS args and
            kwargs should not be specified without the * syntax

        The client - sends messages, and receives replies after they have been
        processed by the
        """
        # NAME: job_client
        with ut.Indenter('[client %d] ' % (jobiface.id_)):
            if jobiface.verbose >= 1:
                print('----')

            request = {}
            try:
                if flask.request:
                    request = {
                        'endpoint': flask.request.path,
                        'function': flask.request.endpoint,
                        'input': flask.request.processed,
                    }
            except RuntimeError:
                pass

            if args is None:
                args = tuple([])
            if kwargs is None:
                kwargs = {}

            if jobid is not None:
                assert isinstance(jobid, str)
                jobid_ = uuid.UUID(jobid)
                assert jobid_ is not None and isinstance(jobid_, uuid.UUID)

            engine_request = {
                'action': action,
                'args': args,
                'kwargs': kwargs,
                'callback_url': callback_url,
                'callback_method': callback_method,
                'callback_detailed': callback_detailed,
                'request': request,
                'restart_jobid': jobid,
                'restart_jobcounter': None,
                'restart_received': None,
                'lane': lane,
            }
            if jobiface.verbose >= 2:
                print('Queue job: %s' % (ut.repr2(engine_request, truncate=True),))

            # Send request to job
            jobiface.engine_recieve_socket.send_json(engine_request)
            reply_notify = jobiface.engine_recieve_socket.recv_json()
            print('reply_notify = %r' % (reply_notify,))

            jobid_ = reply_notify['jobid']
            if jobid is not None:
                assert jobid == jobid_
            jobid = jobid_

            ibs = jobiface.ibs
            if ibs is not None:
                shelve_path = ibs.get_shelves_path()
                ut.ensuredir(shelve_path)
                record_filename = '%s.pkl' % (jobid,)
                record_filepath = join(shelve_path, record_filename)
                record = {
                    'request': engine_request,
                    'attempts': 0,
                    'completed': False,
                }
                ut.save_cPkl(record_filepath, record, verbose=False)

            # Release memory
            action = None
            args = None
            kwargs = None
            callback_url = None
            callback_method = None
            callback_detailed = None
            request = None
            engine_request = None

            return jobid

    def get_job_id_list(jobiface):
        with ut.Indenter('[client %d] ' % (jobiface.id_)):
            if False:  # jobiface.verbose >= 1:
                print('----')
                print('Request list of job ids')
            pair_msg = dict(action='job_id_list')
            # CALLS: collector_request_status
            jobiface.collect_recieve_socket.send_json(pair_msg)
            reply = jobiface.collect_recieve_socket.recv_json()
            return reply

    def get_job_status(jobiface, jobid):
        with ut.Indenter('[client %d] ' % (jobiface.id_)):
            if jobiface.verbose >= 1:
                print('----')
                print('Request status of jobid=%r' % (jobid,))
            pair_msg = dict(action='job_status', jobid=jobid)
            # CALLS: collector_request_status
            jobiface.collect_recieve_socket.send_json(pair_msg)
            reply = jobiface.collect_recieve_socket.recv_json()
            return reply

    def get_job_status_dict(jobiface):
        with ut.Indenter('[client %d] ' % (jobiface.id_)):
            if False:  # jobiface.verbose >= 1:
                print('----')
                print('Request list of job ids')
            pair_msg = dict(action='job_status_dict')
            # CALLS: collector_request_status
            jobiface.collect_recieve_socket.send_json(pair_msg)
            reply = jobiface.collect_recieve_socket.recv_json()
            return reply

    def get_job_metadata(jobiface, jobid):
        with ut.Indenter('[client %d] ' % (jobiface.id_)):
            if jobiface.verbose >= 1:
                print('----')
                print('Request metadata of jobid=%r' % (jobid,))
            pair_msg = dict(action='job_input', jobid=jobid)
            # CALLS: collector_request_metadata
            jobiface.collect_recieve_socket.send_json(pair_msg)
            reply = jobiface.collect_recieve_socket.recv_json()
            return reply

    def get_job_result(jobiface, jobid):
        with ut.Indenter('[client %d] ' % (jobiface.id_)):
            if jobiface.verbose >= 1:
                print('----')
                print('Request result of jobid=%r' % (jobid,))
            pair_msg = dict(action='job_result', jobid=jobid)
            # CALLER: collector_request_result
            jobiface.collect_recieve_socket.send_json(pair_msg)
            reply = jobiface.collect_recieve_socket.recv_json()
            return reply

    def get_unpacked_result(jobiface, jobid):
        reply = jobiface.get_job_result(jobid)
        json_result = reply['json_result']
        try:
            result = ut.from_json(json_result)
        except TypeError as ex:
            ut.printex(ex, keys=['json_result'], iswarning=True)
            result = json_result
        except Exception as ex:
            ut.printex(ex, 'Failed to unpack result', keys=['json_result'])
            result = reply['json_result']
        # Release raw JSON result
        json_result = None
        return result

    def wait_for_job_result(jobiface, jobid, timeout=10, freq=0.1):
        t = ut.Timer(verbose=False)
        t.tic()
        while True:
            reply = jobiface.get_job_status(jobid)
            if reply['jobstatus'] == 'completed':
                return
            elif reply['jobstatus'] == 'exception':
                result = jobiface.get_unpacked_result(jobid)
                # raise Exception(result)
                print('Exception occurred in engine')
                return result
            elif reply['jobstatus'] == 'working':
                pass
            elif reply['jobstatus'] == 'unknown':
                pass
            else:
                raise Exception('Unknown jobstatus=%r' % (reply['jobstatus'],))
            reply = None  # Release memory
            time.sleep(freq)
            if timeout is not None and t.toc() > timeout:
                raise Exception('Timeout')

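# Hedged end-to-end sketch of the client-side flow implemented by JobInterface
# (it mirrors job_engine_tester above; the backend must already be running and
# `dbdir` is assumed to point at an existing database):
#
#   backend = JobBackend(use_static_ports=True)
#   backend.initialize_background_processes(dbdir)
#   jobiface = JobInterface(0, backend.port_dict)
#   jobiface.initialize_client_thread()
#   jobid = jobiface.queue_job(action='helloworld', args=(1,))
#   jobiface.wait_for_job_result(jobid)
#   result = jobiface.get_unpacked_result(jobid)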
def collect_queue_loop(port_dict):
    name = 'collect'
    assert name is not None, 'must name queue'
    queue_name = name + '_queue'
    loop_name = queue_name + '_loop'

    update_proctitle(queue_name)

    interface_pull = port_dict['%s_pull_url' % (name,)]
    interface_push = port_dict['%s_push_url' % (name,)]

    with ut.Indenter('[%s] ' % (queue_name,)):
        if VERBOSE_JOBS:
            print('Init make_queue_loop: name=%r' % (name,))
        # bind the client dealer to the queue router
        recieve_socket = ctx.socket(zmq.ROUTER)  # CHECKED - ROUTER
        recieve_socket.setsockopt_string(zmq.IDENTITY, 'queue.' + name + '.' + 'ROUTER')
        recieve_socket.bind(interface_pull)
        if VERBOSE_JOBS:
            print('bind %s_url1 = %r' % (name, interface_pull))
        # bind the server router to the queue dealer
        send_socket = ctx.socket(zmq.DEALER)  # CHECKED - DEALER
        send_socket.setsockopt_string(zmq.IDENTITY, 'queue.' + name + '.' + 'DEALER')
        send_socket.bind(interface_push)
        if VERBOSE_JOBS:
            print('bind %s_url2 = %r' % (name, interface_push))
        try:
            zmq.device(zmq.QUEUE, recieve_socket, send_socket)  # CHECKED - QUEUE
        except KeyboardInterrupt:
            print('Caught ctrl+c in queue loop. Gracefully exiting')
        recieve_socket.unbind(interface_pull)
        recieve_socket.close()
        send_socket.unbind(interface_push)
        send_socket.close()
        if VERBOSE_JOBS:
            print('Exiting %s' % (loop_name,))

[docs]def engine_queue_loop(port_dict, engine_lanes): """ Specialized queue loop """ # Flow of information tags: # NAME: engine_queue name = 'engine' queue_name = name + '_queue' loop_name = queue_name + '_loop' update_proctitle(queue_name) print = partial(ut.colorprint, color='red') interface_engine_pull = port_dict['engine_pull_url'] interface_engine_push_dict = { lane: port_dict['engine_%s_push_url' % (lane,)] for lane in engine_lanes } interface_collect_pull = port_dict['collect_pull_url'] with ut.Indenter('[%s] ' % (queue_name,)): print('Init specialized make_queue_loop: name=%r' % (name,)) # bind the client dealer to the queue router engine_receive_socket = ctx.socket(zmq.ROUTER) # CHECK2 - REP engine_receive_socket.setsockopt_string( zmq.IDENTITY, 'special_queue.' + name + '.' + 'ROUTER' ) engine_receive_socket.bind(interface_engine_pull) if VERBOSE_JOBS: print('bind %s_url2 = %r' % (name, interface_engine_pull)) # bind the server router to the queue dealer engine_send_socket_dict = {} for lane in interface_engine_push_dict: engine_send_socket = ctx.socket(zmq.DEALER) # CHECKED - DEALER engine_send_socket.setsockopt_string( zmq.IDENTITY, 'special_queue.' + lane + '.' + name + '.' + 'DEALER' ) engine_send_socket.bind(interface_engine_push_dict[lane]) if VERBOSE_JOBS: print( 'bind %s %s_url2 = %r' % (name, lane, interface_engine_push_dict[lane]) ) engine_send_socket_dict[lane] = engine_send_socket collect_recieve_socket = ctx.socket(zmq.DEALER) # CHECKED - DEALER collect_recieve_socket.setsockopt_string( zmq.IDENTITY, queue_name + '.collect.DEALER' ) collect_recieve_socket.connect(interface_collect_pull) if VERBOSE_JOBS: print('connect collect_pull_url = %r' % (interface_collect_pull)) # but this shows what is really going on: poller = zmq.Poller() poller.register(engine_receive_socket, zmq.POLLIN) for lane in engine_send_socket_dict: engine_send_socket = engine_send_socket_dict[lane] poller.register(engine_send_socket, zmq.POLLIN) # always start at 0 global_jobcounter = 0 try: while True: evts = dict(poller.poll()) if engine_receive_socket in evts: # CALLER: job_client idents, engine_request = rcv_multipart_json( engine_receive_socket, num=1, print=print ) set_jobcounter = engine_request.get('__set_jobcounter__', None) if set_jobcounter is not None: global_jobcounter = set_jobcounter reply_notify = { 'jobcounter': global_jobcounter, } print( '... 
notifying client that jobcounter was updated to %d' % (global_jobcounter,) ) # RETURNS: job_client_return send_multipart_json(engine_receive_socket, idents, reply_notify) continue # jobid = 'jobid-%04d' % (jobcounter,) jobid = '%s' % (uuid.uuid4(),) jobcounter = global_jobcounter + 1 received = _timestamp() action = engine_request['action'] args = engine_request['args'] kwargs = engine_request['kwargs'] callback_url = engine_request['callback_url'] callback_method = engine_request['callback_method'] callback_detailed = engine_request.get('callback_detailed', False) request = engine_request['request'] restart_jobid = engine_request.get('restart_jobid', None) restart_jobcounter = engine_request.get('restart_jobcounter', None) restart_received = engine_request.get('restart_received', None) lane = engine_request.get('lane', 'slow') if lane not in engine_lanes: print( 'WARNING: did not recognize desired lane %r from %r' % (lane, engine_lanes) ) print('WARNING: Defaulting to slow lane') lane = 'slow' engine_request['lane'] = lane if restart_jobid is not None: '[RESTARTING] Replacing jobid=%s with previous restart_jobid=%s' % ( jobid, restart_jobid, ) jobid = restart_jobid if restart_jobcounter is not None: '[RESTARTING] Replacing jobcounter=%s with previous restart_jobcounter=%s' % ( jobcounter, restart_jobcounter, ) jobcounter = restart_jobcounter print('Creating jobid %r (counter %d)' % (jobid, jobcounter)) if restart_received is not None: received = restart_received ###################################################################### # Status: Received (Notify Collector) # Reply immediately with a new jobid reply_notify = { 'jobid': jobid, 'jobcounter': jobcounter, 'status': 'received', 'action': 'notification', } if VERBOSE_JOBS: print('...notifying collector about new job') # CALLS: collector_notify collect_recieve_socket.send_json(reply_notify) ###################################################################### # Status: Received (Notify Client) if VERBOSE_JOBS: print('... 
notifying client that job was accepted') print('%r' % (idents,)) print('%r' % (reply_notify,)) # RETURNS: job_client_return send_multipart_json(engine_receive_socket, idents, reply_notify) ###################################################################### # Status: Metadata # Reply immediately with a new jobid metadata_notify = { 'jobid': jobid, 'metadata': { 'jobcounter': jobcounter, 'action': action, 'args': args, 'kwargs': kwargs, 'callback_url': callback_url, 'callback_method': callback_method, 'callback_detailed': callback_detailed, 'request': request, 'times': { 'received': received, 'started': None, 'updated': None, 'completed': None, 'runtime': None, 'turnaround': None, 'runtime_sec': None, 'turnaround_sec': None, }, 'lane': lane, }, 'action': 'metadata', } if VERBOSE_JOBS: print('...notifying collector about job metadata') # CALLS: collector_notify collect_recieve_socket.send_json(metadata_notify) ###################################################################### # Status: Accepted (Metadata Processed) # We have been accepted, let's update the global_jobcounter global_jobcounter = jobcounter # Reply immediately with a new jobid reply_notify = { 'jobid': jobid, 'status': 'accepted', 'action': 'notification', } if VERBOSE_JOBS: print('...notifying collector about new job') # CALLS: collector_notify collect_recieve_socket.send_json(reply_notify) ###################################################################### # Status: Queueing on the Engine assert 'jobid' not in engine_request engine_request['jobid'] = jobid if VERBOSE_JOBS: print('... notifying backend engine to start') # CALL: engine_ engine_send_socket = engine_send_socket_dict[lane] send_multipart_json(engine_send_socket, idents, engine_request) # Release idents = None engine_request = None ###################################################################### # Status: Queued queued_notify = { 'jobid': jobid, 'status': 'queued', 'action': 'notification', } if VERBOSE_JOBS: print('...notifying collector that job was queued') # CALLS: collector_notify collect_recieve_socket.send_json(queued_notify) except KeyboardInterrupt: print('Caught ctrl+c in %s queue. Gracefully exiting' % (loop_name,)) poller.unregister(engine_receive_socket) for lane in engine_send_socket_dict: engine_send_socket = engine_send_socket_dict[lane] poller.unregister(engine_send_socket) engine_receive_socket.unbind(interface_engine_pull) engine_receive_socket.close() for lane in interface_engine_push_dict: engine_send_socket = engine_send_socket_dict[lane] engine_send_socket.unbind(interface_engine_push_dict[lane]) engine_send_socket.close() collect_recieve_socket.disconnect(interface_collect_pull) collect_recieve_socket.close() if VERBOSE_JOBS: print('Exiting %s queue' % (loop_name,))
[docs]def engine_loop(id_, port_dict, dbdir, containerized, lane): r""" IBEIS: This will be part of a worker process with its own IBEISController instance. Needs to send where the results will go and then publish the results there. The engine_loop - receives messages, performs some action, and sends a reply, preserving the leading two message parts as routing identities """ # NAME: engine_ # CALLED_FROM: engine_queue import wbia try: import tensorflow as tf # NOQA from keras import backend as K # NOQA config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) K.set_session(sess) except (ImportError, RuntimeError): pass # base_print = print # NOQA print = partial(ut.colorprint, color='brightgreen') with ut.Indenter('[engine %s %d] ' % (lane, id_)): interface_engine_push = port_dict['engine_%s_push_url' % (lane,)] interface_collect_pull = port_dict['collect_pull_url'] if VERBOSE_JOBS: print('Initializing %s engine %s' % (lane, id_)) print('connect engine_%s_push_url = %r' % (lane, interface_engine_push)) assert dbdir is not None engine_send_sock = ctx.socket(zmq.ROUTER) # CHECKED - ROUTER engine_send_sock.setsockopt_string( zmq.IDENTITY, 'engine.%s.%s' % (lane, id_), ) engine_send_sock.connect(interface_engine_push) collect_recieve_socket = ctx.socket(zmq.DEALER) collect_recieve_socket.setsockopt_string( zmq.IDENTITY, 'engine.%s.%s.collect.DEALER' % (lane, id_), ) collect_recieve_socket.connect(interface_collect_pull) if VERBOSE_JOBS: print('connect collect_pull_url = %r' % (interface_collect_pull,)) print('engine is initialized') ibs = wbia.opendb(dbdir=dbdir, use_cache=False, web=False, daily_backup=False) update_proctitle('engine_loop.%s.%s' % (lane, id_), dbname=ibs.dbname) try: while True: try: idents, engine_request = rcv_multipart_json( engine_send_sock, print=print ) action = engine_request['action'] jobid = engine_request['jobid'] args = engine_request['args'] kwargs = engine_request['kwargs'] callback_url = engine_request['callback_url'] callback_method = engine_request['callback_method'] callback_detailed = engine_request.get('callback_detailed', False) lane_ = engine_request['lane'] if VERBOSE_JOBS: print('\tjobid = %r' % (jobid,)) print('\taction = %r' % (action,)) print('\targs = %r' % (args,)) print('\tkwargs = %r' % (kwargs,)) print('\tlane = %r' % (lane,)) print('\tlane_ = %r' % (lane_,)) # Notify start working reply_notify = { # 'idents': idents, 'jobid': jobid, 'status': 'working', 'action': 'notification', } collect_recieve_socket.send_json(reply_notify) engine_result = on_engine_request(ibs, jobid, action, args, kwargs) exec_status = engine_result['exec_status'] # Notify start working reply_notify = { # 'idents': idents, 'jobid': jobid, 'status': 'publishing', 'action': 'notification', } collect_recieve_socket.send_json(reply_notify) # Store results in the collector collect_request = { # 'idents': idents, 'action': 'store', 'jobid': jobid, 'engine_result': engine_result, 'callback_url': callback_url, 'callback_method': callback_method, 'callback_detailed': callback_detailed, } # if VERBOSE_JOBS: print( '...done working. 
pushing result to collector for jobid %s' % (jobid,) ) # CALLS: collector_store collect_recieve_socket.send_json(collect_request) # Notify start working reply_notify = { # 'idents': idents, 'jobid': jobid, 'status': exec_status, 'action': 'notification', } collect_recieve_socket.send_json(reply_notify) # We no longer need the engine result, and can clear it's memory engine_request = None engine_result = None collect_request = None except KeyboardInterrupt: raise except Exception as ex: result = ut.formatex(ex, keys=['jobid'], tb=True) result = ut.strip_ansi(result) print_ = partial(ut.colorprint, color='brightred') with ut.Indenter('[job engine worker error] '): print_(result) raise except KeyboardInterrupt: print('Caught ctrl+c in engine loop. Gracefully exiting') engine_send_sock.disconnect(interface_engine_push) engine_send_sock.close() collect_recieve_socket.disconnect(interface_collect_pull) collect_recieve_socket.close() # Release the IBEIS controller for each job, hopefully freeing memory ibs = None # Explicitly try to release GPU memory try: import torch torch.cuda.empty_cache() except Exception: pass # Explicitly release Python memory try: import gc gc.collect() except Exception: pass # ---- if VERBOSE_JOBS: print('Exiting engine loop')
def on_engine_request(
    ibs, jobid, action, args, kwargs, attempts=3, retry_delay_min=1, retry_delay_max=60
):
    """Run whenever the engine receives a message"""
    assert attempts > 0
    attempts = int(attempts)
    assert 0 <= retry_delay_min and retry_delay_min <= 60 * 60
    retry_delay_min = int(retry_delay_min)
    assert 0 <= retry_delay_max and retry_delay_max <= 60 * 60
    retry_delay_max = int(retry_delay_max)
    assert retry_delay_min < retry_delay_max

    # Start working
    if VERBOSE_JOBS:
        print('starting job=%r' % (jobid,))
    # Map actions to IBEISController calls here
    if action == 'helloworld':

        def helloworld(time_=0, *args, **kwargs):
            time.sleep(time_)
            retval = ('HELLO time_=%r ' % (time_,)) + ut.repr2((args, kwargs))
            return retval

        action_func = helloworld
    else:
        # check for ibs func
        action_func = getattr(ibs, action)
        if VERBOSE_JOBS:
            print('resolving action=%r to wbia function=%r' % (action, action_func))

    key = '__jobid__'
    kwargs[key] = jobid

    exec_status = None
    while exec_status is None:
        try:
            attempt = 0
            while attempt < 10:  # Global max attempts of 10
                attempt += 1
                try:
                    result = action_func(*args, **kwargs)
                    break  # success, no exception, break out of the loop
                except Exception:
                    if attempt < attempts:
                        print(
                            'JOB %r FAILED (attempt %d of %d)!'
                            % (jobid, attempt, attempts)
                        )
                        retry_delay = random.uniform(retry_delay_min, retry_delay_max)
                        print('\t WAITING %0.02f SECONDS THEN RETRYING' % (retry_delay,))
                        time.sleep(retry_delay)
                    else:
                        raise
            exec_status = 'completed'
        except Exception as ex:
            # Remove __jobid__ from kwargs if it's not accepted by the action_func
            if key in kwargs:
                kwargs.pop(key, None)
                continue
            result = ut.formatex(ex, keys=['jobid'], tb=True)
            result = ut.strip_ansi(result)
            exec_status = 'exception'

    json_result = ut.to_json(result)
    result = None  # Clear any used memory

    engine_result = {
        'exec_status': exec_status,
        'json_result': json_result,
        'jobid': jobid,
    }
    return engine_result

def collector_loop(port_dict, dbdir, containerized):
    """
    Service that stores completed algorithm results
    """
    import wbia

    print = partial(ut.colorprint, color='yellow')
    with ut.Indenter('[collect] '):
        collect_rout_sock = ctx.socket(zmq.ROUTER)  # CHECK2 - PULL
        collect_rout_sock.setsockopt_string(zmq.IDENTITY, 'collect.ROUTER')
        collect_rout_sock.connect(port_dict['collect_push_url'])
        if VERBOSE_JOBS:
            print('connect collect_push_url = %r' % (port_dict['collect_push_url'],))

        ibs = wbia.opendb(dbdir=dbdir, use_cache=False, web=False, daily_backup=False)
        update_proctitle('collector_loop', dbname=ibs.dbname)

        shelve_path = ibs.get_shelves_path()
        ut.ensuredir(shelve_path)

        collector_data = {}
        try:
            while True:
                # several callers here
                # CALLER: collector_notify
                # CALLER: collector_store
                # CALLER: collector_request_status
                # CALLER: collector_request_metadata
                # CALLER: collector_request_result
                idents, collect_request = rcv_multipart_json(
                    collect_rout_sock, print=print
                )
                try:
                    reply = on_collect_request(
                        ibs,
                        collect_request,
                        collector_data,
                        shelve_path,
                        containerized=containerized,
                    )
                except Exception as ex:
                    import traceback

                    print(ut.repr3(collect_request))
                    ut.printex(ex, 'ERROR in collection')
                    print(traceback.format_exc())
                    reply = {}
                send_multipart_json(collect_rout_sock, idents, reply)

                idents = None
                collect_request = None

                # Explicitly release Python memory
                try:
                    import gc

                    gc.collect()
                except Exception:
                    pass
        except KeyboardInterrupt:
            print('Caught ctrl+c in collector loop. Gracefully exiting')
        collect_rout_sock.disconnect(port_dict['collect_push_url'])
        collect_rout_sock.close()
        if VERBOSE_JOBS:
            print('Exiting collector')

def _timestamp():
    timezone = pytz.timezone(TIMESTAMP_TIMEZONE)
    now = datetime.now(timezone)
    timestamp = now.strftime(TIMESTAMP_FMTSTR)
    return timestamp

def invalidate_global_cache(jobid):
    global JOB_STATUS_CACHE
    JOB_STATUS_CACHE.pop(jobid, None)

def get_collector_shelve_filepaths(collector_data, jobid):
    if jobid is None:
        return None, None
    shelve_input_filepath = collector_data.get(jobid, {}).get('input', None)
    shelve_output_filepath = collector_data.get(jobid, {}).get('output', None)
    return shelve_input_filepath, shelve_output_filepath

def convert_to_date(timestamp):
    TIMESTAMP_FMTSTR_ = ' '.join(TIMESTAMP_FMTSTR.split(' ')[:-1])
    timestamp_ = ' '.join(timestamp.split(' ')[:-1])
    timestamp_date = datetime.strptime(timestamp_, TIMESTAMP_FMTSTR_)
    return timestamp_date

def calculate_timedelta(start, end):
    start_date = convert_to_date(start)
    end_date = convert_to_date(end)
    delta = end_date - start_date
    total_seconds = int(delta.total_seconds())
    total_seconds_ = total_seconds
    hours = total_seconds_ // (60 * 60)
    total_seconds_ -= hours * 60 * 60
    minutes = total_seconds_ // 60
    total_seconds_ -= minutes * 60
    seconds = total_seconds_
    return hours, minutes, seconds, total_seconds

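# Worked example (values chosen for illustration): with TIMESTAMP_FMTSTR as defined
# above, calculate_timedelta('2021-03-01 10:00:00 PST', '2021-03-01 11:30:45 PST')
# strips the timezone token, parses both stamps, and returns (1, 30, 45, 5445),
# i.e. 1 hour, 30 minutes, 45 seconds, 5445 total seconds.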
[docs]def on_collect_request( ibs, collect_request, collector_data, shelve_path, containerized=False ): """Run whenever the collector recieves a message""" import requests action = collect_request.get('action', None) jobid = collect_request.get('jobid', None) status = collect_request.get('status', None) reply = { 'status': 'ok', 'jobid': jobid, } # Ensure we have a collector record for the jobid if jobid is not None: if jobid not in collector_data: collector_data[jobid] = { 'status': None, 'input': None, 'output': None, } runtime_lock_filepath = join(shelve_path, '%s.lock' % (jobid,)) else: runtime_lock_filepath = None args = get_collector_shelve_filepaths(collector_data, jobid) collector_shelve_input_filepath, collector_shelve_output_filepath = args print( 'on_collect_request action = %r, jobid = %r, status = %r' % ( action, jobid, status, ) ) if action == 'notification': assert None not in [jobid, runtime_lock_filepath] # received # accepted # queued # working # publishing # completed # exception # suppressed # corrupted current_status = collector_data[jobid].get('status', None) print('Updating jobid = %r status %r -> %r' % (jobid, current_status, status)) collector_data[jobid]['status'] = status print('Notify %s' % ut.repr3(collector_data[jobid])) invalidate_global_cache(jobid) if status == 'received': ut.touch(runtime_lock_filepath) if status == 'completed': if exists(runtime_lock_filepath): ut.delete(runtime_lock_filepath) # Mark the engine request as finished record_filename = '%s.pkl' % (jobid,) record_filepath = join(shelve_path, record_filename) record = ut.load_cPkl(record_filepath, verbose=False) record['completed'] = True ut.save_cPkl(record_filepath, record, verbose=False) record = None # Update relevant times in the shelf if collector_shelve_input_filepath is None: metadata = None else: metadata = get_shelve_value(collector_shelve_input_filepath, 'metadata') if metadata is not None: times = metadata.get('times', {}) times['updated'] = _timestamp() if status == 'working': times['started'] = _timestamp() if status == 'completed': times['completed'] = _timestamp() # Calculate runtime received = times.get('received', None) started = times.get('started', None) completed = times.get('completed', None) runtime = times.get('runtime', None) turnaround = times.get('turnaround', None) if None not in [started, completed] and runtime is None: hours, minutes, seconds, total_seconds = calculate_timedelta( started, completed ) args = ( hours, minutes, seconds, total_seconds, ) times['runtime'] = '%d hours %d min. %s sec. (total: %d sec.)' % args times['runtime_sec'] = total_seconds if None not in [received, completed] and turnaround is None: hours, minutes, seconds, total_seconds = calculate_timedelta( received, completed ) args = ( hours, minutes, seconds, total_seconds, ) times['turnaround'] = '%d hours %d min. %s sec. 
(total: %d sec.)' % args times['turnaround_sec'] = total_seconds metadata['times'] = times set_shelve_value(collector_shelve_input_filepath, 'metadata', metadata) metadata = None # Release memory elif action == 'register': assert None not in [jobid] invalidate_global_cache(jobid) shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(ibs, jobid) metadata = get_shelve_value(shelve_input_filepath, 'metadata') engine_result = get_shelve_value(shelve_output_filepath, 'result') if status == 'completed': # Ensure we can read the data we expect out of a completed job if None in [metadata, engine_result]: status = 'corrupted' collector_data[jobid] = { 'status': status, 'input': shelve_input_filepath, 'output': shelve_output_filepath, } print('Register %s' % ut.repr3(collector_data[jobid])) metadata, engine_result = None, None # Release memory elif action == 'metadata': invalidate_global_cache(jobid) # From the Engine metadata = collect_request.get('metadata', None) shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(ibs, jobid) collector_data[jobid]['input'] = shelve_input_filepath set_shelve_value(shelve_input_filepath, 'metadata', metadata) print('Stored Metadata %s' % ut.repr3(collector_data[jobid])) metadata = None # Release memory elif action == 'store': invalidate_global_cache(jobid) # From the Engine engine_result = collect_request.get('engine_result', None) callback_url = collect_request.get('callback_url', None) callback_method = collect_request.get('callback_method', None) callback_detailed = collect_request.get('callback_detailed', False) # Get the engine result jobid jobid = engine_result.get('jobid', jobid) assert jobid in collector_data shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths(ibs, jobid) collector_data[jobid]['output'] = shelve_output_filepath set_shelve_value(shelve_output_filepath, 'result', engine_result) print('Stored Result %s' % ut.repr3(collector_data[jobid])) engine_result = None # Release memory if callback_url is not None: if containerized: callback_url = callback_url.replace('://localhost/', '://wildbook:8080/') if callback_method is None: callback_method = 'POST' callback_method = callback_method.upper() message = 'callback_method %r unsupported' % (callback_method,) assert callback_method in ['POST', 'GET', 'PUT'], message try: data_dict = {'jobid': jobid} if callback_detailed: shelve_value = get_shelve_value(shelve_output_filepath, 'result') data_dict['status'] = shelve_value['exec_status'] data_dict['json_result'] = ut.from_json(shelve_value['json_result']) shelve_value = None # Release memory args = ( callback_url, callback_method, data_dict, ) print( 'Attempting job completion callback to %r\n\tHTTP Method: %r\n\tData Payload: %r' % args ) # Perform callback if callback_method == 'POST': if callback_url.startswith('houston+'): # Remove houston+ from callback_url call_houston( callback_url[8:], method='POST', data=ut.to_json(data_dict), headers={'Content-Type': 'application/json'}, ) response = requests.post(callback_url, data=data_dict) elif callback_method == 'GET': if callback_url.startswith('houston+'): # Remove houston+ from callback_url call_houston(callback_url[8:], method='GET', params=data_dict) response = requests.get(callback_url, params=data_dict) elif callback_method == 'PUT': if callback_url.startswith('houston+'): # Remove houston+ from callback_url call_houston( callback_url[8:], method='PUT', data=ut.to_json(data_dict), headers={'Content-Type': 'application/json'}, ) response = 
requests.put(callback_url, data=data_dict) else: raise RuntimeError() # Check response try: text = unicode(response.text).encode('utf-8') # NOQA except Exception: text = None args = ( response, text, ) print('Callback completed...\n\tResponse: %r\n\tText: %r' % args) except Exception: print('Callback FAILED!') elif action == 'job_status': reply['jobstatus'] = collector_data.get(jobid, {}).get('status', 'unknown') elif action == 'job_status_dict': json_result = {} for jobid in collector_data: if jobid in JOB_STATUS_CACHE: job_status_data = JOB_STATUS_CACHE.get(jobid, None) else: status = collector_data[jobid]['status'] shelve_input_filepath, shelve_output_filepath = get_shelve_filepaths( ibs, jobid ) metadata = get_shelve_value(shelve_input_filepath, 'metadata') cache = True if metadata is None: if status in ['corrupted']: status = 'corrupted' elif status in ['suppressed']: status = 'suppressed' elif status in ['completed']: status = 'corrupted' else: # status = 'pending' cache = False metadata = { 'jobcounter': -1, } times = metadata.get('times', {}) request = metadata.get('request', {}) # Support legacy jobs if request is None: request = {} job_status_data = { 'status': status, 'jobcounter': metadata.get('jobcounter', None), 'action': metadata.get('action', None), 'endpoint': request.get('endpoint', None), 'function': request.get('function', None), 'time_received': times.get('received', None), 'time_started': times.get('started', None), 'time_runtime': times.get('runtime', None), 'time_updated': times.get('updated', None), 'time_completed': times.get('completed', None), 'time_turnaround': times.get('turnaround', None), 'time_runtime_sec': times.get('runtime_sec', None), 'time_turnaround_sec': times.get('turnaround_sec', None), 'lane': metadata.get('lane', None), } if cache: JOB_STATUS_CACHE[jobid] = job_status_data json_result[jobid] = job_status_data reply['json_result'] = json_result metadata = None # Release memory elif action == 'job_id_list': reply['jobid_list'] = sorted(list(collector_data.keys())) elif action == 'job_input': if jobid not in collector_data: reply['status'] = 'invalid' metadata = None else: metadata = get_shelve_value(collector_shelve_input_filepath, 'metadata') if metadata is None: reply['status'] = 'corrupted' reply['json_result'] = metadata metadata = None # Release memory elif action == 'job_result': if jobid not in collector_data: reply['status'] = 'invalid' result = None else: status = collector_data[jobid]['status'] engine_result = get_shelve_value(collector_shelve_output_filepath, 'result') if engine_result is None: if status in ['corrupted']: status = 'corrupted' elif status in ['suppressed']: status = 'suppressed' elif status in ['completed']: status = 'corrupted' else: # status = 'pending' pass reply['status'] = status result = None else: reply['status'] = engine_result['exec_status'] json_result = engine_result['json_result'] result = ut.from_json(json_result) reply['json_result'] = result engine_result = None # Release memory else: # Other print('...error unknown action=%r' % (action,)) reply['status'] = 'error' return reply
def send_multipart_json(sock, idents, reply):
    """helper"""
    reply_json = ut.to_json(reply).encode('utf-8')
    reply = None
    multi_reply = idents + [reply_json]
    sock.send_multipart(multi_reply)

def rcv_multipart_json(sock, num=2, print=print):
    """helper"""
    # note that the first two parts will be ['Controller.ROUTER', 'Client.<id_>']
    # these are needed for the reply to propagate up to the right client
    multi_msg = sock.recv_multipart()
    if VERBOSE_JOBS:
        print('----')
        print('RCV Json: %s' % (ut.repr2(multi_msg, truncate=True),))
    idents = multi_msg[:num]
    request_json = multi_msg[num]
    request = ut.from_json(request_json)
    request_json = None
    multi_msg = None
    return idents, request

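# Wire-format sketch for the two helpers above (frame values are illustrative): a
# request arriving on a ROUTER socket is a multipart message whose leading frames
# are routing identities and whose last frame is the JSON payload, e.g.
#
#   [b'queue.collect.DEALER', b'client0.collect.DEALER', b'{"action": "job_status", ...}']
#
# send_multipart_json() echoes the same identity frames back in front of the reply
# so the response is routed to the original caller.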
def _on_ctrl_c(signal, frame):
    print('[wbia.zmq] Caught ctrl+c')
    print('[wbia.zmq] sys.exit(0)')
    import sys

    sys.exit(0)


def _init_signals():
    import signal

    signal.signal(signal.SIGINT, _on_ctrl_c)


if __name__ == '__main__':
    """
    CommandLine:
        python -m ibeis.web.job_engine
        python -m ibeis.web.job_engine --allexamples
        python -m ibeis.web.job_engine --allexamples --noface --nosrc
    """
    import multiprocessing

    multiprocessing.freeze_support()  # for win32
    import utool as ut  # NOQA

    ut.doctest_funcs()