-- | Various Second-Order Array Combinators that are operationally
-- parallel in a way that can be exploited by the compiler.
--
-- The functions here are all recognised specially by the compiler (or
-- built on those that are).  The asymptotic [work and
-- span](https://en.wikipedia.org/wiki/Analysis_of_parallel_algorithms)
-- is provided for each function, but note that this easily hides very
-- substantial constant factors.  For example, `scan`@term is *much*
-- slower than `reduce`@term, although they have the same asymptotic
-- complexity.
--
-- *Reminder on terminology*: A function `op` is said to be
-- *associative* if
--
--     (x `op` y) `op` z == x `op` (y `op` z)
--
-- for all `x`, `y`, `z`.  Similarly, it is *commutative* if
--
--     x `op` y == y `op` x
--
-- The value `o` is a *neutral element* if
--
--     x `op` o == o `op` x == x
--
-- for any `x`.

import "zip"

-- | Apply the given function to each element of an array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let map 'a [n] 'x (f: a -> x) (as: [n]a): *[n]x =
  intrinsics.map (f, as)

-- | Apply the given function to each element of a single array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let map1 'a [n] 'x (f: a -> x) (as: [n]a): *[n]x =
  map f as

-- | As `map1`@term, but with one more array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let map2 'a 'b [n] 'x (f: a -> b -> x) (as: [n]a) (bs: [n]b): *[n]x =
  map (\(a, b) -> f a b) (zip2 as bs)

-- | As `map2`@term, but with one more array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let map3 'a 'b 'c [n] 'x (f: a -> b -> c -> x) (as: [n]a) (bs: [n]b) (cs: [n]c): *[n]x =
  map (\(a, b, c) -> f a b c) (zip3 as bs cs)

-- | As `map3`@term, but with one more array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let map4 'a 'b 'c 'd [n] 'x (f: a -> b -> c -> d -> x) (as: [n]a) (bs: [n]b) (cs: [n]c) (ds: [n]d): *[n]x =
  map (\(a, b, c, d) -> f a b c d) (zip4 as bs cs ds)

-- | As `map4`@term, but with one more array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let map5 'a 'b 'c 'd 'e [n] 'x (f: a -> b -> c -> d -> e -> x) (as: [n]a) (bs: [n]b) (cs: [n]c) (ds: [n]d) (es: [n]e): *[n]x =
  map (\(a, b, c, d, e) -> f a b c d e) (zip5 as bs cs ds es)

-- | Reduce the array `as` with `op`, with `ne` as the neutral
-- element for `op`.  The function `op` must be associative.  If
-- it is not, the return value is unspecified.  If the value returned
-- by the operator is an array, it must have the exact same size as
-- the neutral element, and that must again have the same size as the
-- elements of the input array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let reduce 'a (op: a -> a -> a) (ne: a) (as: []a): a =
  intrinsics.reduce (op, ne, as)

-- | As `reduce`, but the operator must also be commutative.  This
-- is potentially faster than `reduce`.  For simple built-in
-- operators, like addition, the compiler already knows that the
-- operator is associative.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let reduce_comm 'a (op: a -> a -> a) (ne: a) (as: []a): a =
  intrinsics.reduce_comm (op, ne, as)

-- | `reduce_by_index dest f ne is as` returns `dest`, but with each
-- element given by the indices of `is` updated by applying `f` to the
-- current value in `dest` and the corresponding value in `as`.  The
-- `ne` value must be a neutral element for `op`.  If `is` has
-- duplicates, `f` may be applied multiple times, and hence must be
-- associative and commutative.  Out-of-bounds indices in `is` are
-- ignored.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(n)* in the worst case (all updates to same position),
-- but *O(1)* in the best case.
--
-- In practice, the *O(n)* behaviour only occurs if *m* is also very
-- large.
let reduce_by_index 'a [m] [n] (dest : *[m]a) (f : a -> a -> a) (ne : a) (is : [n]i32) (as : [n]a) : *[m]a =
  intrinsics.gen_reduce (dest, f, ne, is, as)

-- | Inclusive prefix scan.  Has the same caveats with respect to
-- associativity as `reduce`.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let scan [n] 'a (op: a -> a -> a) (ne: a) (as: [n]a): *[n]a =
  intrinsics.scan (op, ne, as)

-- | Remove all those elements of `as` that do not satisfy the
-- predicate `p`.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let filter 'a (p: a -> bool) (as: []a): *[]a =
  let (as', is) = intrinsics.partition (1, \x -> if p x then 0 else 1, as)
  in as'[:is[0]]

-- | Split an array into those elements that satisfy the given
-- predicate, and those that do not.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let partition [n] 'a (p: a -> bool) (as: [n]a): ([]a, []a) =
  let p' x = if p x then 0 else 1
  let (as', is) = intrinsics.partition (2, p', as)
  in (as'[0:is[0]], as'[is[0]:n])

-- | Split an array by two predicates, producing three arrays.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let partition2 [n] 'a (p1: a -> bool) (p2: a -> bool) (as: [n]a): ([]a, []a, []a) =
  let p' x = if p1 x then 0 else if p2 x then 1 else 2
  let (as', is) = intrinsics.partition (3, p', as)
  in (as'[0:is[0]], as'[is[0]:is[0]+is[1]], as'[is[0]+is[1]:n])

-- | `stream_red op f as` splits `as` into chunks, applies `f` to each
-- of these in parallel, and uses `op` (which must be associative) to
-- combine the per-chunk results into a final result.  This SOAC is
-- useful when `f` can be given a particularly work-efficient
-- sequential implementation.  Operationally, we can imagine that `as`
-- is divided among as many threads as necessary to saturate the
-- machine, with each thread operating sequentially.
--
-- A chunk may be empty, `f []` must produce the neutral element for
-- `op`.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let stream_red 'a 'b (op: b -> b -> b) (f: []a -> b) (as: []a): b =
  intrinsics.stream_red (op, f, as)

-- | As `stream_red`@term, but the chunks do not necessarily
-- correspond to subsequences of the original array (they may be
-- interleaved).
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let stream_red_per 'a 'b (op: b -> b -> b) (f: []a -> b) (as: []a): b =
  intrinsics.stream_red_per (op, f, as)

-- | Similar to `stream_red`@term, except that each chunk must produce
-- an array *of the same size*.  The per-chunk results are
-- concatenated.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let stream_map 'a 'b (f: []a -> []b) (as: []a): *[]b =
  intrinsics.stream_map (f, as)

-- | Similar to `stream_map`@term, but the chunks do not necessarily
-- correspond to subsequences of the original array (they may be
-- interleaved).
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let stream_map_per 'a 'b (f: []a -> []b) (as: []a): *[]b =
  intrinsics.stream_map_per (f, as)

-- | Return `true` if the given function returns `true` for all
-- elements in the array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let all 'a (f: a -> bool) (as: []a): bool =
  reduce (&&) true (map f as)

-- | Return `true` if the given function returns `true` for any
-- elements in the array.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(log(n))*
let any 'a (f: a -> bool) (as: []a): bool =
  reduce (||) false (map f as)

-- | The `scatter as is vs` expression calculates the equivalent of
-- this imperative code:
--
-- ```
-- for index in 0..length is-1:
--   i = is[index]
--   v = vs[index]
--   as[i] = v
-- ```
--
-- The `is` and `vs` arrays must have the same outer size.  `scatter`
-- acts in-place and consumes the `as` array, returning a new array
-- that has the same type and elements as `as`, except for the indices
-- in `is`.  If `is` contains duplicates (i.e. several writes are
-- performed to the same location), the result is unspecified.  It is
-- not guaranteed that one of the duplicate writes will complete
-- atomically - they may be interleaved.  See `reduce_by_index`@term
-- for a function that can handle this case deterministically.
--
-- This is technically not a second-order operation, but it is defined
-- here because it is closely related to the SOACs.
--
-- **Work:** *O(n)*
--
-- **Span:** *O(1)*
let scatter 't [m] [n] (dest: *[m]t) (is: [n]i32) (vs: [n]t): *[m]t =
  intrinsics.scatter (dest, is, vs)