Source code for _delb.plugins.https_loader

# Copyright (C) 2018-'22  Frank Sachsenheim
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.


"""
If ``delb`` is installed with ``https-loader`` as extra, the required
dependencies for this loader are installed as well. See :doc:`installation`.
"""


from __future__ import annotations

from io import IOBase
from types import SimpleNamespace
from typing import Any, Iterator, Optional

from _delb.plugins import plugin_manager
from _delb.plugins.core_loaders import buffer_loader, ftp_loader
from _delb.typing import LoaderResult


# httpx is an optional dependency. When it cannot be imported, the module
# exports nothing.
try:
    import httpx  # noqa: F401
except ImportError:
    __all__: tuple[str, ...] = ()
else:
    # Module-wide client that is created once at import time; it follows
    # redirects and has HTTP/2 support enabled.
    DEFAULT_CLIENT = httpx.Client(follow_redirects=True, http2=True)

    class HttpsStreamWrapper(IOBase):
        """
        Exposes the body of a streamed :class:`httpx.Response` through a
        file-like ``read`` method, so that it can be consumed incrementally.

        The response's byte iterator can only be obtained once, hence bytes
        that were fetched but not yet requested are kept in an internal buffer.
        """

        __slots__ = ("_buffer", "_generator", "_response")

        def __init__(self, response: httpx.Response):
            # Bytes that were pulled from the response but not handed out yet.
            self._buffer = b""
            # Created lazily on the first effective read, so that the chunk
            # size can follow the size of that first request.
            self._generator: Optional[Iterator[bytes]] = None
            self._response = response

        def read(self, size: int = 4096) -> bytes:
            """
            Returns up to ``size`` bytes from the response body.

            :param size: The maximum number of bytes to return. A negative
                         value or ``None`` returns everything that is left.
            :return: The next bytes of the body; an empty bytes object once
                     the body is exhausted (or when ``size`` is ``0``).
            """
            if size == 0:
                return b""

            read_all = size is None or size < 0

            if self._generator is None:
                self._generator = self._response.iter_bytes(
                    chunk_size=None if read_all else size
                )

            if read_all:
                data = self._buffer + b"".join(self._generator)
                self._buffer = b""
                return data

            # Skip empty chunks: an empty return value must only ever signal
            # the end of the stream.
            while not self._buffer:
                try:
                    self._buffer = next(self._generator)
                except StopIteration:
                    return b""

            # Never hand out more than requested, even if the chunk size that
            # was fixed by the first read is larger than this request.
            data, self._buffer = self._buffer[:size], self._buffer[size:]
            return data

@plugin_manager.register_loader(before=ftp_loader)
def https_loader(
    data: Any, config: SimpleNamespace, client: httpx.Client = DEFAULT_CLIENT
) -> LoaderResult:
    """
    Loads a document from a URL that uses the ``http`` or ``https`` scheme.

    The default httpx_-client follows redirects and can partially be
    configured with `environment variables`_. After the document was loaded,
    the URL is bound to the name ``source_url`` on the document's
    :attr:`Document.config` attribute.

    A loader that requires a specifically configured httpx-client can delegate
    to this loader like so:

    .. testcode::

        import httpx
        from _delb.plugins import plugin_manager
        from _delb.plugins.https_loader import https_loader


        client = httpx.Client(follow_redirects=False, trust_env=False)

        @plugin_manager.register_loader(before=https_loader)
        def custom_https_loader(data, config):
            return https_loader(data, config, client=client)

    :param data: The input to probe; only strings that start with ``http://``
                 or ``https://`` (compared case-insensitively) are handled.
    :param config: The configuration namespace that is handed to the buffer
                   loader and that receives ``source_url``.
    :param client: The httpx-client that is used for the request.
    :return: What the buffer loader returns for the streamed response body, or
             a message that states why the input was not handled.
    :raises httpx.HTTPStatusError: If the response has an error status code.

    .. _environment variables: https://www.python-httpx.org/environment_variables/
    .. _httpx: https://www.python-httpx.org/
    """
    handled_schemes = ("http://", "https://")
    is_web_url = isinstance(data, str) and data.lower().startswith(handled_schemes)

    # Guard clause: a message instead of a result tells the caller that this
    # loader did not handle the input.
    if not is_web_url:
        return "The input value is not an URL with the http or https scheme."

    with client.stream("get", url=data) as response:
        response.raise_for_status()
        # The body is consumed while the stream is still open.
        stream = HttpsStreamWrapper(response)
        loaded = buffer_loader(stream, config)

    config.source_url = data
    return loaded
# The loader function is the only name this module exports; the entry is
# derived from the function object rather than spelled out as a literal.
__all__ = (https_loader.__name__,)