diff data_schema/util.py @ 5:84dfd1a94926

Add the existing implementation. All tests work. The documentation as text file is included also.
author Franz Glasner <fzglas.hg@dom66.de>
date Thu, 06 Jul 2023 23:41:41 +0200
parents
children 2352d14ae261
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data_schema/util.py	Thu Jul 06 23:41:41 2023 +0200
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+# :-
+# :Copyright: (c) 2023 Franz Glasner
+# :License: BSD-3-Clause. See LICENSE.txt for details.
+# :-
+r"""Some utility functions for use within the package.
+
+"""
+
+__all__ = ["get_data_stream"]
+
+
+try:
+    from importlib import resources as il_resources
+except ImportError:
+    il_resources = None
+    import pkg_resources
+
+import rfc3986
+import rfc3986.validators
+
+
+def _is_safe_path(path):
+    """Check that `path` cannot climb out of its base directory.
+
+    A path is rejected when it contains a backslash or a colon anywhere,
+    or when any of its ``/``-separated segments is exactly ``..``.
+
+    :param str path: the slash-separated path to check
+    :return: `True` if none of the rejected constructs is found,
+             `False` otherwise
+    :rtype: bool
+
+    """
+    if '\\' in path or ':' in path:
+        return False
+    # Comparing whole segments also catches a path that is just "..",
+    # which substring tests for "../", "/.." and "/../" would miss.
+    return ".." not in path.split('/')
+
+
+def get_data_stream(uri, basedir=None, basepackage=None):
+    """Open the resource that `uri` refers to and return a stream for it.
+
+    Two URI schemes are handled:
+
+    "data:" URIs
+      have the form ``data:[<package>:]<relative/path>``.  They are
+      opened as Python package resources that live below ``packagedata``
+      within `<package>`.  If the package part is missing or empty
+      `basepackage` is used instead.  The path must be relative and
+      must contain neither percent-encoded characters, nor a second
+      colon, nor ``//``.
+
+    "file:" URIs
+      must have an absolute path.  The path is resolved by prepending
+      `basedir` and the file is opened in binary mode.  Paths with
+      percent-encoded characters, backslashes, colons or ``..``
+      segments are rejected.
+
+    For both schemes authority, query and fragment must be empty.
+
+    The returned stream needs to be closed by the caller as usual.
+
+    :param str uri: the URI of the resource to open
+    :param basedir: the directory that "file:" URI paths are relative to
+    :param basepackage: the package that is used for "data:" URIs
+                        that do not name a package themselves
+    :return: the opened stream
+    :raises ValueError: if the URI has an unsupported scheme or violates
+                        one of the restrictions above, or if no package
+                        is available for a "data:" URI
+    :raises TypeError: if a "file:" URI is given but `basedir` is `None`
+
+    """
+    u = rfc3986.URIReference.from_string(uri).normalize()
+    if u.scheme == "data":
+        if u.authority or u.query or u.fragment:
+            raise ValueError("invalid data URI: authority, query and "
+                             "fragment MUST be empty")
+        if not rfc3986.validators.path_is_valid(u.path, require=True):
+            raise ValueError("invalid or empty empty path within a data URI")
+        if '%' in u.path:
+            raise ValueError("URI encoded paths not supported")
+        # An optional "<package>:" prefix selects the package to search in
+        datapackage, sep, datapath = u.path.partition(':')
+        if sep:
+            if not datapackage:
+                datapackage = basepackage
+            if ':' in datapath:
+                raise ValueError("colon in an URI's path not supported")
+        else:
+            datapackage = basepackage
+            datapath = u.path
+        # Reject empty path segments whatever the normalization left behind
+        if "//" in datapath:
+            raise ValueError(
+                "URI path for the `data' scheme contains `//' substring")
+        if datapath.startswith('/'):
+            raise ValueError(
+                "URI path for the `data' scheme must not be absolute")
+        if datapackage is None:
+            raise ValueError("missing the data package")
+        if il_resources:
+            # importlib wants a dotted package name and a plain file name:
+            # every directory level of the path becomes a sub-package.
+            datapath_dirs, _, datapath_file = datapath.rpartition('/')
+            respackage = datapackage + '.packagedata'
+            if datapath_dirs:
+                respackage += '.' + datapath_dirs.replace('/', '.')
+            return il_resources.open_binary(respackage, datapath_file)
+        return pkg_resources.resource_stream(  # noqa:E501    # pylint:disable=used-before-assignment
+            datapackage, "packagedata/" + datapath)
+    elif u.scheme == "file":
+        if u.authority or u.query or u.fragment:
+            raise ValueError("invalid file URI: authority, query and "
+                             "fragment MUST be empty")
+        if not rfc3986.validators.path_is_valid(u.path, require=True):
+            raise ValueError("invalid or empty empty path within a file URI")
+        if '%' in u.path:
+            raise ValueError(
+                "percent-encoded paths not supported in data-stream file URI")
+        if not _is_safe_path(u.path):
+            raise ValueError("unsafe path in file URI is not supported")
+        if not u.path.startswith('/'):
+            raise ValueError("relative file URI not handled")
+        # The absolute URI path is resolved relative to `basedir`
+        if basedir is None:
+            raise TypeError("no base directory in `basedir' given")
+        return open("{}/{}".format(basedir.rstrip("/\\"),
+                                   u.path.lstrip('/')),
+                    "rb")
+    else:
+        raise ValueError("scheme `{}' not supported".format(u.scheme))