matrix-org_synapse/synapse/config/oembed.py

# Copyright 2021 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
from typing import Any, Dict, Iterable, List, Optional, Pattern
from urllib import parse as urlparse

import attr
import pkg_resources

from synapse.types import JsonDict, StrSequence

from ._base import Config, ConfigError
from ._util import validate_config


@attr.s(slots=True, frozen=True, auto_attribs=True)
class OEmbedEndpointConfig:
    """A single oEmbed API endpoint and the URL patterns it is used for.

    Instances are immutable (``frozen=True``). They are created by
    ``OembedConfig`` while it walks the provider definitions, one instance
    per entry of a provider's ``endpoints`` list.
    """

    # The API endpoint to fetch. OembedConfig only creates instances for
    # endpoints whose URL scheme is http or https.
    api_endpoint: str
    # The patterns to match: regular expressions compiled from the globs in
    # the endpoint's "schemes" list.
    url_patterns: List[Pattern[str]]
    # The supported formats, taken from the endpoint's optional "formats" key;
    # None when the provider definition does not include that key.
    formats: Optional[List[str]]


class OembedConfig(Config):
    """oEmbed Configuration"""

    section = "oembed"

    def read_config(self, config: JsonDict, **kwargs: Any) -> None:
        oembed_config: Dict[str, Any] = config.get("oembed") or {}

        # A list of patterns which will be used.
        self.oembed_patterns: List[OEmbedEndpointConfig] = list(
            self._parse_and_validate_providers(oembed_config)
        )

    def _parse_and_validate_providers(
        self, oembed_config: dict
    ) -> Iterable[OEmbedEndpointConfig]:
        """Extract and parse the oEmbed providers from the given JSON file.

        Returns a generator which yields the OidcProviderConfig objects
        """
        # Whether to use the packaged providers.json file.
        if not oembed_config.get("disable_default_providers") or False:
            with pkg_resources.resource_stream("synapse", "res/providers.json") as s:
                providers = json.load(s)

            yield from self._parse_and_validate_provider(
                providers, config_path=("oembed",)
            )

        # The JSON files which includes additional provider information.
        for i, file in enumerate(oembed_config.get("additional_providers") or []):
            # TODO Error checking.
            with open(file) as f:
                providers = json.load(f)

            yield from self._parse_and_validate_provider(
                providers,
                config_path=(
                    "oembed",
                    "additional_providers",
                    f"<item {i}>",
                ),
            )

    def _parse_and_validate_provider(
        self, providers: List[JsonDict], config_path: StrSequence
    ) -> Iterable[OEmbedEndpointConfig]:
        """Validate a decoded provider list and yield one config per endpoint.

        Args:
            providers: The decoded contents of a providers JSON file.
            config_path: Where the providers were referenced in the
                configuration; passed to ``validate_config`` and attached to
                any ConfigError raised here.

        Returns:
            A generator of OEmbedEndpointConfig objects, one for every
            endpoint of every provider, in the order they appear.

        Raises:
            ConfigError: If an endpoint URL, or one of its patterns, does not
                use the http or https scheme.
        """
        # Check the overall shape before looking at any individual entry.
        validate_config(_OEMBED_PROVIDER_SCHEMA, providers, config_path=config_path)

        # A provider may expose several API endpoints; walk them as one stream.
        all_endpoints = (
            endpoint for entry in providers for endpoint in entry["endpoints"]
        )
        for endpoint in all_endpoints:
            endpoint_url = endpoint["url"]

            # Only HTTP(S) API endpoints are supported.
            scheme = urlparse.urlparse(endpoint_url).scheme
            if scheme not in ("http", "https"):
                raise ConfigError(
                    f"Unsupported oEmbed scheme ({scheme}) for endpoint {endpoint_url}",
                    config_path,
                )

            # Each endpoint may in turn list several URL globs to match.
            compiled = []
            for glob in endpoint["schemes"]:
                compiled.append(self._glob_to_pattern(glob, config_path))

            yield OEmbedEndpointConfig(
                api_endpoint=endpoint_url,
                url_patterns=compiled,
                formats=endpoint.get("formats"),
            )

    def _glob_to_pattern(self, glob: str, config_path: StrSequence) -> Pattern:
        """
        Convert the glob into a sane regular expression to match against. The
        rules followed will be slightly different for the domain portion vs.
        the rest.

        1. The scheme must be one of HTTP / HTTPS (and have no globs).
        2. The domain can have globs, but we limit it to characters that can
           reasonably be a domain part.
           TODO: This does not attempt to handle Unicode domain names.
           TODO: The domain should not allow wildcard TLDs.
        3. Other parts allow a glob to be any one, or more, characters.
        """
        results = urlparse.urlparse(glob)

        # The scheme must be HTTP(S) (and cannot contain wildcards).
        if results.scheme not in {"http", "https"}:
            raise ConfigError(
                f"Unsupported oEmbed scheme ({results.scheme}) for pattern: {glob}",
                config_path,
            )

        pattern = urlparse.urlunparse(
            [
                results.scheme,
                re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+"),
            ]
            + [re.escape(part).replace("\\*", ".+") for part in results[2:]]
        )
        return re.compile(pattern)


# JSON schema for one entry of a provider's "endpoints" list. Only "schemes"
# (the URL globs) and "url" (the API endpoint) are mandatory.
_OEMBED_ENDPOINT_SCHEMA = {
    "type": "object",
    "properties": {
        "schemes": {
            "type": "array",
            "items": {"type": "string"},
        },
        "url": {"type": "string"},
        "formats": {"type": "array", "items": {"type": "string"}},
        "discovery": {"type": "boolean"},
    },
    "required": ["schemes", "url"],
}

# JSON schema for a providers file: an array of providers, each of which has
# a name, a URL and a list of endpoints.
_OEMBED_PROVIDER_SCHEMA = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "provider_name": {"type": "string"},
            "provider_url": {"type": "string"},
            "endpoints": {
                "type": "array",
                "items": _OEMBED_ENDPOINT_SCHEMA,
            },
        },
        "required": ["provider_name", "provider_url", "endpoints"],
    },
}