Source code for resolwe.flow.management.commands.register

""".. Ignore pydocstyle D400.

==================
Register Processes
==================

"""
import os
import re

import jsonschema
import yaml
from versionfield.utils import convert_version_string_to_int

from django.conf import settings
from django.contrib.auth import get_user_model
from django.core.exceptions import ValidationError
from django.core.management.base import BaseCommand
from django.db.models import Max, Q

from resolwe.flow.engine import InvalidEngineError
from resolwe.flow.finders import get_finders
from resolwe.flow.managers import manager
from resolwe.flow.models import DescriptorSchema, Process
from resolwe.flow.models.base import VERSION_NUMBER_BITS
from resolwe.flow.models.utils import validate_schema, validation_schema
from resolwe.flow.utils import dict_dot, iterate_schema
from resolwe.permissions.utils import assign_contributor_permissions, copy_permissions

PROCESSOR_SCHEMA = validation_schema("processor")
DESCRIPTOR_SCHEMA = validation_schema("descriptor")

SCHEMA_TYPE_DESCRIPTOR = "descriptor"
SCHEMA_TYPE_PROCESS = "process"


[docs]class Command(BaseCommand):
    """Register processes."""

    help = "Register processes"

[docs]    def add_arguments(self, parser):
        """Command arguments."""
        parser.add_argument(
            "-f",
            "--force",
            action="store_true",
            help="register also if version mismatch",
        )
        parser.add_argument(
            "--retire",
            default=False,
            action="store_true",
            help="retire obsolete processes",
        )

[docs]    def valid(self, instance, schema):
        """Validate schema."""
        try:
            jsonschema.validate(instance, schema)
        except jsonschema.exceptions.ValidationError as ex:
            self.stderr.write(
                "    VALIDATION ERROR: {}".format(
                    instance["name"] if "name" in instance else ""
                )
            )
            self.stderr.write("        path:       {}".format(ex.path))
            self.stderr.write("        message:    {}".format(ex.message))
            self.stderr.write("        validator:  {}".format(ex.validator))
            self.stderr.write("        val. value: {}".format(ex.validator_value))
            return False

        try:
            # Check that default values fit field schema.
            for field in ["input", "output", "schema"]:
                for schema, _, path in iterate_schema({}, instance.get(field, {})):
                    if "default" in schema:
                        validate_schema({schema["name"]: schema["default"]}, [schema])
        except ValidationError:
            self.stderr.write("    VALIDATION ERROR: {}".format(instance["name"]))
            self.stderr.write(
                "        Default value of field '{}' is not valid.".format(path)
            )
            return False

        return True

[docs]    def find_descriptor_schemas(self, schema_file):
        """Find descriptor schemas in given path."""
        if not schema_file.lower().endswith((".yml", ".yaml")):
            return []

        with open(schema_file) as fn:
            schemas = yaml.load(fn, Loader=yaml.FullLoader)
        if not schemas:
            self.stderr.write("Could not read YAML file {}".format(schema_file))
            return []

        descriptor_schemas = []
        for schema in schemas:
            if "schema" not in schema:
                continue

            descriptor_schemas.append(schema)

        return descriptor_schemas

[docs]    def find_schemas(self, schema_path, schema_type=SCHEMA_TYPE_PROCESS, verbosity=1):
        """Find schemas in packages that match filters."""
        schema_matches = []

        if not os.path.isdir(schema_path):
            if verbosity > 0:
                self.stdout.write("Invalid path {}".format(schema_path))
            return

        if schema_type not in [SCHEMA_TYPE_PROCESS, SCHEMA_TYPE_DESCRIPTOR]:
            raise ValueError("Invalid schema type")

        for root, _, files in os.walk(schema_path):
            for schema_file in [os.path.join(root, fn) for fn in files]:
                schemas = None
                if schema_type == SCHEMA_TYPE_DESCRIPTOR:
                    # Discover descriptors.
                    schemas = self.find_descriptor_schemas(schema_file)
                elif schema_type == SCHEMA_TYPE_PROCESS:
                    # Perform process discovery for all supported execution engines.
                    schemas = []
                    for execution_engine in manager.execution_engines.values():
                        schemas.extend(execution_engine.discover_process(schema_file))

                for schema in schemas:
                    schema_matches.append(schema)

        return schema_matches

[docs]    def register_processes(self, process_schemas, user, force=False, verbosity=1):
        """Read and register processors."""
        log_processors = []
        log_templates = []

        for p in process_schemas:
            # TODO: Remove this when all processes are migrated to the
            #       new syntax.
            if "flow_collection" in p:
                if "entity" in p:
                    self.stderr.write(
                        "Skip processor {}: only one of 'flow_collection' and 'entity' fields "
                        "allowed".format(p["slug"])
                    )
                    continue

                p["entity"] = {"type": p.pop("flow_collection")}

            if p["type"][-1] != ":":
                p["type"] += ":"

            if "category" in p and not p["category"].endswith(":"):
                p["category"] += ":"

            for field in ["input", "output"]:
                for schema, _, _ in iterate_schema({}, p[field] if field in p else {}):
                    if not schema["type"][-1].endswith(":"):
                        schema["type"] += ":"
            # TODO: Check if schemas validate with our JSON meta schema and Processor model docs.

            if not self.valid(p, PROCESSOR_SCHEMA):
                continue

            if "entity" in p:
                if "type" not in p["entity"]:
                    self.stderr.write(
                        "Skip process {}: 'entity.type' required if 'entity' defined".format(
                            p["slug"]
                        )
                    )
                    continue
                if "input" in p["entity"] and p["entity"].get("always_create", False):
                    self.stderr.write(
                        "Skip process {}: 'entity.input' will not be considered if 'entity.always_create' "
                        "is set to true.".format(p["slug"])
                    )
                    continue

                p["entity_type"] = p["entity"]["type"]
                p["entity_descriptor_schema"] = p["entity"].get(
                    "descriptor_schema", p["entity_type"]
                )
                p["entity_input"] = p["entity"].get("input", None)
                p["entity_always_create"] = p["entity"].get("always_create", False)
                p.pop("entity")

                if not DescriptorSchema.objects.filter(
                    slug=p["entity_descriptor_schema"]
                ).exists():
                    self.stderr.write(
                        "Skip processor {}: Unknown descriptor schema '{}' used in 'entity' "
                        "field.".format(p["slug"], p["entity_descriptor_schema"])
                    )
                    continue

            if "persistence" in p:
                persistence_mapping = {
                    "RAW": Process.PERSISTENCE_RAW,
                    "CACHED": Process.PERSISTENCE_CACHED,
                    "TEMP": Process.PERSISTENCE_TEMP,
                }

                p["persistence"] = persistence_mapping[p["persistence"]]

            if "scheduling_class" in p:
                scheduling_class_mapping = {
                    "interactive": Process.SCHEDULING_CLASS_INTERACTIVE,
                    "batch": Process.SCHEDULING_CLASS_BATCH,
                }

                p["scheduling_class"] = scheduling_class_mapping[p["scheduling_class"]]

            if "input" in p:
                p["input_schema"] = p.pop("input")

            if "output" in p:
                p["output_schema"] = p.pop("output")

            slug = p["slug"]

            if "run" in p:
                # Set default language to 'bash' if not set.
                p["run"].setdefault("language", "bash")

                # Transform output schema using the execution engine.
                try:
                    execution_engine = manager.get_execution_engine(
                        p["run"]["language"]
                    )
                    extra_output_schema = execution_engine.get_output_schema(p)
                    if extra_output_schema:
                        p.setdefault("output_schema", []).extend(extra_output_schema)
                except InvalidEngineError:
                    self.stderr.write(
                        "Skip processor {}: execution engine '{}' not supported".format(
                            slug, p["run"]["language"]
                        )
                    )
                    continue

            # Validate if container image is allowed based on the configured pattern.
            # NOTE: This validation happens here and is not deferred to executors because the idea
            #       is that this will be moved to a "container" requirement independent of the
            #       executor.
            if hasattr(settings, "FLOW_CONTAINER_VALIDATE_IMAGE"):
                try:
                    container_image = dict_dot(p, "requirements.executor.docker.image")
                    if not re.match(
                        settings.FLOW_CONTAINER_VALIDATE_IMAGE, container_image
                    ):
                        self.stderr.write(
                            "Skip processor {}: container image does not match '{}'".format(
                                slug, settings.FLOW_CONTAINER_VALIDATE_IMAGE
                            )
                        )
                        continue
                except KeyError:
                    pass

            version = p["version"]
            int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

            # `latest version` is returned as `int` so it has to be compared to `int_version`
            latest_version = Process.objects.filter(slug=slug).aggregate(
                Max("version")
            )["version__max"]
            if latest_version is not None and latest_version > int_version:
                self.stderr.write(
                    "Skip processor {}: newer version installed".format(slug)
                )
                continue

            previous_process_qs = Process.objects.filter(slug=slug)
            if previous_process_qs.exists():
                previous_process = previous_process_qs.latest()
            else:
                previous_process = None

            process_query = Process.objects.filter(slug=slug, version=version)
            if process_query.exists():
                is_active = process_query.values_list("is_active", flat=True).get()
                if not is_active:
                    p["is_active"] = True
                    if verbosity > 0:
                        self.stdout.write(
                            "Processor {}: setting is_active to True".format(slug)
                        )

                if not force:
                    if verbosity > 0:
                        self.stdout.write(
                            "Skip processor {}: same version installed".format(slug)
                        )
                    continue

                process_query.update(**p)
                log_processors.append("Updated {}".format(slug))
            else:
                process = Process.objects.create(contributor=user, **p)
                assign_contributor_permissions(process)
                if previous_process:
                    copy_permissions(previous_process, process)
                log_processors.append("Inserted {}".format(slug))

        if verbosity > 0:
            if log_processors:
                self.stdout.write("Processor Updates:")
                for log in log_processors:
                    self.stdout.write("  {}".format(log))

            if log_templates:
                self.stdout.write("Default Template Updates:")
                for log in log_templates:
                    self.stdout.write("  {}".format(log))

[docs]    def register_descriptors(self, descriptor_schemas, user, force=False, verbosity=1):
        """Read and register descriptors."""
        log_descriptors = []

        for descriptor_schema in descriptor_schemas:
            for schema, _, _ in iterate_schema({}, descriptor_schema.get("schema", {})):
                if not schema["type"][-1].endswith(":"):
                    schema["type"] += ":"

            if "schema" not in descriptor_schema:
                descriptor_schema["schema"] = []

            if not self.valid(descriptor_schema, DESCRIPTOR_SCHEMA):
                continue

            slug = descriptor_schema["slug"]
            version = descriptor_schema.get("version", "0.0.0")
            int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

            # `latest version` is returned as `int` so it has to be compared to `int_version`
            latest_version = DescriptorSchema.objects.filter(slug=slug).aggregate(
                Max("version")
            )["version__max"]
            if latest_version is not None and latest_version > int_version:
                self.stderr.write(
                    "Skip descriptor schema {}: newer version installed".format(slug)
                )
                continue

            previous_descriptor_qs = DescriptorSchema.objects.filter(slug=slug)
            if previous_descriptor_qs.exists():
                previous_descriptor = previous_descriptor_qs.latest()
            else:
                previous_descriptor = None

            descriptor_query = DescriptorSchema.objects.filter(
                slug=slug, version=version
            )
            if descriptor_query.exists():
                if not force:
                    if verbosity > 0:
                        self.stdout.write(
                            "Skip descriptor schema {}: same version installed".format(
                                slug
                            )
                        )
                    continue

                descriptor_query.update(**descriptor_schema)
                log_descriptors.append("Updated {}".format(slug))
            else:
                descriptor = DescriptorSchema.objects.create(
                    contributor=user, **descriptor_schema
                )
                assign_contributor_permissions(descriptor)
                if previous_descriptor:
                    copy_permissions(previous_descriptor, descriptor)
                log_descriptors.append("Inserted {}".format(slug))

        if log_descriptors and verbosity > 0:
            self.stdout.write("Descriptor schemas Updates:")
            for log in log_descriptors:
                self.stdout.write("  {}".format(log))

[docs]    def retire(self, process_schemas):
        """Retire obsolete processes.

        Remove old process versions without data. Find processes that have been
        registered but do not exist in the code anymore, then:

        - If they do not have data: remove them
        - If they have data: flag them not active (``is_active=False``)

        """
        process_slugs = set(ps["slug"] for ps in process_schemas)

        # Processes that are in DB but not in the code
        retired_processes = Process.objects.filter(~Q(slug__in=process_slugs))

        # Remove retired processes which do not have data
        retired_processes.filter(data__exact=None).delete()

        # Remove non-latest processes which do not have data
        latest_version_processes = Process.objects.order_by(
            "slug", "-version"
        ).distinct("slug")
        Process.objects.filter(data__exact=None).exclude(
            id__in=latest_version_processes
        ).delete()

        # Deactivate retired processes which have data
        retired_processes.update(is_active=False)

[docs]    def handle(self, *args, **options):
        """Register processes."""
        force = options.get("force")
        retire = options.get("retire")
        verbosity = int(options.get("verbosity"))

        users = (
            get_user_model().objects.filter(is_superuser=True).order_by("date_joined")
        )

        if not users.exists():
            self.stderr.write("Admin does not exist: create a superuser")
            exit(1)

        process_paths, descriptor_paths = [], []
        process_schemas, descriptor_schemas = [], []

        for finder in get_finders():
            process_paths.extend(finder.find_processes())
            descriptor_paths.extend(finder.find_descriptors())

        for proc_path in process_paths:
            process_schemas.extend(
                self.find_schemas(
                    proc_path, schema_type=SCHEMA_TYPE_PROCESS, verbosity=verbosity
                )
            )

        for desc_path in descriptor_paths:
            descriptor_schemas.extend(
                self.find_schemas(
                    desc_path, schema_type=SCHEMA_TYPE_DESCRIPTOR, verbosity=verbosity
                )
            )

        user_admin = users.first()
        self.register_descriptors(
            descriptor_schemas, user_admin, force, verbosity=verbosity
        )
        # NOTE: Descriptor schemas must be registered first, so
        #       processes can validate 'entity_descriptor_schema' field.
        self.register_processes(process_schemas, user_admin, force, verbosity=verbosity)

        if retire:
            self.retire(process_schemas)

        if verbosity > 0:
            self.stdout.write("Running executor post-registration hook...")
        manager.get_executor().post_register_hook(verbosity=verbosity)