1"""
2Low-level functions used to define user-facing stencils.
3"""
4import collections.abc as cabc
5import itertools
6import pathlib as plib
7import string
8import types
9
10import numpy as np
11
12import pyxu.info.config as pxcfg
13import pyxu.info.deps as pxd
14import pyxu.info.ptype as pxt
15import pyxu.runtime as pxrt
16import pyxu.util as pxu
17
18
19def _signature(params, returns) -> str:
20 # Translate a signature of the form
21 # [in_1_spec, ..., in_N_spec] -> out_spec
22 # to Numba's string representation.
23 #
24 # Parameters
25 # ----------
26 # params: list(spec)
27 # returns: spec | None
28 #
29 # Returns
30 # -------
31 # sig: str
32 #
33 # Notes
34 # -----
35 # A parameter spec is characterized by the triplet
36 # (dtype[single/double], ndim[int], c_contiguous[bool])
37 def fmt(spec) -> str:
38 dtype, ndim, c_contiguous = spec
39
40 _dtype_spec = {
41 pxrt.Width.SINGLE: "float32",
42 pxrt.Width.DOUBLE: "float64",
43 }[pxrt.Width(dtype)]
44
45 dim_spec = [":"] * ndim
46 if c_contiguous and (ndim > 0):
47 dim_spec[-1] = "::1"
48 dim_spec = "[" + ",".join(dim_spec) + "]"
49
50 _repr = _dtype_spec
51 if ndim > 0:
52 _repr += dim_spec
53 return _repr
54
55 sig = "".join(
56 [
57 "void" if (returns is None) else fmt(returns),
58 "(",
59 ", ".join(map(fmt, params)),
60 ")",
61 ]
62 )
63 return sig
64
65
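# Example (illustrative): for two C-contiguous 3-dimensional float64 arrays and
# no return value -- i.e. the signature a rank-2 stencil generates below --
# `_signature` produces the following Numba type string:
#
#     _signature([(pxrt.Width.DOUBLE, 3, True)] * 2, None)
#     # -> "void(float64[:,:,::1], float64[:,:,::1])"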
class _Stencil:
    """
    Multi-dimensional JIT-compiled stencil. (Low-level function.)

    This low-level class creates a gu-vectorized stencil applicable to multiple inputs simultaneously.
    Only NUMPY/CUPY arrays are accepted.

    Create instances via the factory method :py:meth:`~pyxu.operator._Stencil.init`.

    Example
    -------
    Correlate a stack of images `A` with a (3, 3) kernel such that:

    .. math::

       B[n, m] = A[n-1, m] + A[n, m-1] + A[n, m+1] + A[n+1, m]

    .. code-block:: python3

       import numpy as np
       from pyxu.operator import _Stencil

       # create the stencil
       kernel = np.array([[0, 1, 0],
                          [1, 0, 1],
                          [0, 1, 0]], dtype=np.float64)
       center = (1, 1)
       stencil = _Stencil.init(kernel, center)

       # apply it to the data
       rng = np.random.default_rng()
       A = rng.normal(size=(2, 3, 4, 30, 30))  # 24 images of size (30, 30)
       B = np.zeros_like(A)
       stencil.apply(A, B)  # (2, 3, 4, 30, 30)
    """

    IndexSpec = cabc.Sequence[pxt.Integer]

    @staticmethod
    def init(
        kernel: pxt.NDArray,
        center: IndexSpec,
    ):
        """
        Parameters
        ----------
        kernel: NDArray
            (k1,...,kD) kernel coefficients.

            Only float32/64 kernels are supported.
        center: ~pyxu.operator._Stencil.IndexSpec
            (D,) index of the kernel's center.

        Returns
        -------
        st: ~pyxu.operator._Stencil
            Rank-D stencil.
        """
        dtype = kernel.dtype
        if dtype not in {_.value for _ in pxrt.Width}:
            raise ValueError(f"Unsupported kernel precision {dtype}.")

        center = np.array(center, dtype=int)
        assert center.size == kernel.ndim
        assert np.all((0 <= center) & (center < kernel.shape))

        N = pxd.NDArrayInfo
        ndi = N.from_obj(kernel)
        if ndi == N.NUMPY:
            klass = _Stencil_NP
        elif ndi == N.CUPY:
            klass = _Stencil_CP
        else:
            raise NotImplementedError

        st = klass(kernel, center)
        return st

    def apply(
        self,
        arr: pxt.NDArray,
        out: pxt.NDArray,
        **kwargs,
    ) -> pxt.NDArray:
        r"""
        Evaluate the stencil on multiple inputs.

        Parameters
        ----------
        arr: NDArray
            (..., M1,...,MD) data to process.
        out: NDArray
            (..., M1,...,MD) array to which outputs are written.
        kwargs: dict
            Extra kwargs to configure `f_jit()`, the Dispatcher instance created by Numba.

            Only relevant for GPU stencils, with values:

            * blockspergrid: int
            * threadsperblock: int

            Default values are chosen if unspecified.

        Returns
        -------
        out: NDArray
            (..., M1,...,MD) outputs.

        Notes
        -----
        * `arr` and `out` must have the same type/dtype as the kernel used during instantiation.
        * Index regions in `out` where the stencil is not fully supported are set to 0.
        * :py:meth:`~pyxu.operator._Stencil.apply` may raise ``CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`` when the number of
          GPU registers required exceeds resource limits. There are two solutions to this problem:

          (1) Pass the `max_registers` kwarg to f_jit()'s decorator; or
          (2) `Limit the number of threads per block <https://stackoverflow.com/a/68659008>`_.

          (1) must be set at compile time; it is thus left unbounded.
          (2) is accessible through .apply(\*\*kwargs).
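
          For example, assuming a GPU stencil and an illustrative block size:

          .. code-block:: python3

             stencil.apply(A, B, threadsperblock=64)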
186 """
187 assert arr.dtype == out.dtype == self._kernel.dtype
188 assert arr.shape == out.shape
189 assert arr.flags.c_contiguous and out.flags.c_contiguous
190
191 K_dim = len(self._kernel.shape)
192 dim_shape = arr.shape[-K_dim:]
193
194 stencil = self._configure_dispatcher(arr.size, **kwargs)
195 stencil(
196 # OK since NP/CP input constraint.
197 arr.reshape(-1, *dim_shape),
198 out.reshape(-1, *dim_shape),
199 )
200 return out
201
202 def __init__(
203 self,
204 kernel: pxt.NDArray,
205 center: pxt.NDArray,
206 ):
207 self._kernel = kernel
208 self._center = center
209
210 cached_module = self._gen_code()
211 self._dispatch = cached_module.f_jit
212
213 def _gen_code(self) -> types.ModuleType:
214 # Compile Numba kernel `void f_jit(arr, out)`.
215 #
216 # The code is compiled only if unavailable beforehand.
217 #
218 # Returns
219 # -------
220 # jit_module: module
221 # A (loaded) python package containing method f_jit().
222 raise NotImplementedError
223
224 def _configure_dispatcher(self, pb_size: int, **kwargs) -> cabc.Callable:
225 # Configure `f_jit()`, the Numba Dispatcher instance.
226 #
227 # Parameters
228 # ----------
229 # pb_size: int
230 # Number of stencil evaluations.
231 # **kwargs: dict
232 #
233 # Returns
234 # -------
235 # f: callable
236 # Configured Numba Dispatcher.
237 raise NotImplementedError
238
239
240class _Stencil_NP(_Stencil):
241 def _gen_code(self) -> types.ModuleType:
242 # Generate the code which should be compiled --------------------------
243 sig_spec = (self._kernel.dtype, self._kernel.ndim + 1, True)
244 signature = _signature((sig_spec,) * 2, None)
245
246 template_file = plib.Path(__file__).parent / "_template_cpu.txt"
247 with open(template_file, mode="r") as f:
248 template = string.Template(f.read())
249 code = template.substitute(
250 signature=signature,
251 stencil_spec=self.__stencil_spec(),
252 )
253 # ---------------------------------------------------------------------
254
255 # Store/update cached version as needed.
256 module_name = pxu.cache_module(code)
257 pxcfg.cache_dir(load=True) # make the Pyxu cache importable (if not already done)
258 jit_module = pxu.import_module(module_name)
259 return jit_module
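
    # For intuition only, a minimal sketch of what the generated module *could*
    # look like once `signature` and `stencil_spec` are substituted in (this is
    # an assumption for illustration; the authoritative layout lives in
    # `_template_cpu.txt`), here for the rank-2 kernel of the class docstring:
    #
    #     import numba
    #
    #     @numba.stencil  # default boundary: out-of-support entries set to 0
    #     def f(a):
    #         return a[0,-1,0] + a[0,0,-1] + a[0,0,1] + a[0,1,0]
    #
    #     @numba.njit("void(float64[:,:,::1], float64[:,:,::1])", nogil=True)
    #     def f_jit(arr, out):
    #         f(arr, out=out)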

    def _configure_dispatcher(self, pb_size: int, **kwargs) -> cabc.Callable:
        # Nothing to do for CPU targets.
        return self._dispatch

    def __stencil_spec(self) -> str:
        f_fmt = {  # coef float-formatter
            pxrt.Width.SINGLE: "1.8e",
            pxrt.Width.DOUBLE: "1.16e",
        }[pxrt.Width(self._kernel.dtype)]

        entry = []
        _range = list(map(range, self._kernel.shape))
        for idx in itertools.product(*_range):
            idx_c = [i - c for (i, c) in zip(idx, self._center)]
            idx_c = ",".join(map(str, idx_c))

            cst = self._kernel[idx]
            if np.isclose(cst, 0):
                # no useless look-ups at runtime
                e = None
            elif np.isclose(cst, 1):
                # no multiplication required
                e = f"a[0,{idx_c}]"
            else:
                # general case
                e = f"({cst:{f_fmt}} * a[0,{idx_c}])"

            if e is not None:
                entry.append(e)

        spec = " + ".join(entry)
        return spec
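
    # Example (illustrative): for the (3, 3) cross-shaped kernel with center
    # (1, 1) from the class docstring, `__stencil_spec` returns
    #
    #     "a[0,-1,0] + a[0,0,-1] + a[0,0,1] + a[0,1,0]"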


class _Stencil_CP(_Stencil):
    def _gen_code(self) -> types.ModuleType:
        # Generate the code which should be compiled --------------------------
        sig_spec = (self._kernel.dtype, self._kernel.ndim + 1, True)
        signature = _signature((sig_spec,) * 2, None)

        template_file = plib.Path(__file__).parent / "_template_gpu.txt"
        with open(template_file, mode="r") as f:
            template = string.Template(f.read())
        code = template.substitute(
            kernel_center=str(tuple(self._center.tolist())),
            kernel_width=str(self._kernel.shape),
            signature=signature,
            stencil_spec=self.__stencil_spec(),
            unravel_spec=self.__unravel_spec(),
        )
        # ---------------------------------------------------------------------

        # Store/update the cached version as needed.
        module_name = pxu.cache_module(code)
        pxcfg.cache_dir(load=True)  # make the Pyxu cache importable (if not already done)
        jit_module = pxu.import_module(module_name)
        return jit_module
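
    # For intuition only, a minimal sketch of what the generated module *could*
    # look like after substitution (an assumption for illustration; the
    # authoritative layout lives in `_template_gpu.txt`): a 1D CUDA grid where
    # each thread unravels its flat offset into an ND index, then evaluates the
    # stencil at that point (boundary handling omitted here):
    #
    #     from numba import cuda
    #
    #     @cuda.jit("void(float64[:,:,::1], float64[:,:,::1])")
    #     def f_jit(arr, out):
    #         offset = cuda.grid(1)
    #         if offset < arr.size:
    #             shape = arr.shape
    #             <unravel_spec>   # sets idx = (i0, i1, i2)
    #             out[idx] = <stencil_spec>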

    def _configure_dispatcher(self, pb_size: int, **kwargs) -> cabc.Callable:
        # Set (`threadsperblock`, `blockspergrid`).
        assert set(kwargs.keys()) <= {
            "threadsperblock",
            "blockspergrid",
        }

        attr = self._kernel.device.attributes
        tpb = kwargs.get("threadsperblock", attr["MaxThreadsPerBlock"])
        bpg = kwargs.get("blockspergrid", (pb_size // tpb) + 1)
        return self._dispatch[bpg, tpb]
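
    # Example (illustrative): for the (2, 3, 4, 30, 30) input of the class
    # docstring (pb_size = 21_600) and a device reporting
    # MaxThreadsPerBlock = 1024, the defaults come out as
    #
    #     tpb = 1024
    #     bpg = (21_600 // 1024) + 1 = 22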

    def __stencil_spec(self) -> str:
        f_fmt = {  # coef float-formatter
            pxrt.Width.SINGLE: "1.8e",
            pxrt.Width.DOUBLE: "1.16e",
        }[pxrt.Width(self._kernel.dtype)]

        entry = []
        _range = list(map(range, self._kernel.shape))
        for idx in itertools.product(*_range):
            # create a string of the form "idx[1]+i1,...,idx[K]+iK"
            idx_c = [i - c for (i, c) in zip(idx, self._center)]
            idx_c = [f"idx[{i1}]{i2:+d}" for (i1, i2) in enumerate(idx_c, start=1)]
            idx_c = ",".join(idx_c)

            cst = self._kernel[idx]
            if np.isclose(cst, 0):
                # no useless look-ups at runtime
                e = None
            elif np.isclose(cst, 1):
                # no multiplication required
                e = f"arr[idx[0],{idx_c}]"
            else:
                # general case
                e = f"({cst:{f_fmt}} * arr[idx[0],{idx_c}])"

            if e is not None:
                entry.append(e)

        spec = " + ".join(entry)
        return spec
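
    # Example (illustrative): for the (3, 3) cross-shaped kernel with center
    # (1, 1) from the class docstring, `__stencil_spec` returns the single-line
    # string (wrapped here for readability)
    #
    #     "arr[idx[0],idx[1]-1,idx[2]+0] + arr[idx[0],idx[1]+0,idx[2]-1]
    #      + arr[idx[0],idx[1]+0,idx[2]+1] + arr[idx[0],idx[1]+1,idx[2]+0]"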

    def __unravel_spec(self) -> str:
        N = self._kernel.ndim + 1  # 1 stack-dim
        entry = []

        # left = offset
        e = "left = offset"
        entry.append(e)

        # blk = prod(shape)
        e = "blk = " + " * ".join([f"shape[{n}]" for n in range(N)])
        entry.append(e)

        for n in range(N):
            # blk //= shape[n]
            e = f"blk //= shape[{n}]"
            entry.append(e)

            # i{n} = left // blk
            e = f"i{n} = left // blk"
            entry.append(e)

            # left -= i{n} * blk
            e = f"left -= i{n} * blk"
            entry.append(e)

        # idx = (i0, ..., i{N})
        e = "idx = (" + ", ".join([f"i{n}" for n in range(N)]) + ")"
        entry.append(e)

        # indent each entry by 4, then concatenate
        for i in range(1, len(entry)):  # 1st line skipped
            entry[i] = "    " + entry[i]
        spec = "\n".join(entry)
        return spec
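
    # Example (illustrative): for a rank-2 kernel (N = 3, one stack-dim), the
    # generated unravelling code reads (first line unindented, the rest indented
    # by 4 for splicing into the template):
    #
    #     left = offset
    #         blk = shape[0] * shape[1] * shape[2]
    #         blk //= shape[0]
    #         i0 = left // blk
    #         left -= i0 * blk
    #         blk //= shape[1]
    #         i1 = left // blk
    #         left -= i1 * blk
    #         blk //= shape[2]
    #         i2 = left // blk
    #         left -= i2 * blk
    #         idx = (i0, i1, i2)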