/*=============================================================================

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    FLINT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FLINT; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

=============================================================================*/
/******************************************************************************

    Copyright (C) 2010 William Hart
    Copyright (C) 2011 Fredrik Johansson
    Copyright (C) 2011 Sebastian Pancratz
    Copyright (C) 2014 Ashish Kedia

******************************************************************************/

*******************************************************************************

    Helper functions

*******************************************************************************

int signed_mpn_sub_n(mp_ptr res, mp_srcptr op1, mp_srcptr op2, slong n)

    If \code{op1 >= op2} return 0 and set \code{res} to \code{op1 - op2}
    else return 1 and set \code{res} to \code{op2 - op1}.

*******************************************************************************

    Memory management

*******************************************************************************

void nmod_poly_init(nmod_poly_t poly, mp_limb_t n)

    Initialises \code{poly}. It will have coefficients modulo~$n$.

void nmod_poly_init_preinv(nmod_poly_t poly, mp_limb_t n, mp_limb_t ninv)

    Initialises \code{poly}. It will have coefficients modulo~$n$.
    The caller supplies a precomputed inverse limb generated by
    \code{n_preinvert_limb()}.

void nmod_poly_init2(nmod_poly_t poly, mp_limb_t n, slong alloc)

    Initialises \code{poly}. It will have coefficients modulo~$n$.
    Up to \code{alloc} coefficients may be stored in \code{poly}.

void nmod_poly_init2_preinv(nmod_poly_t poly,
                       mp_limb_t n, mp_limb_t ninv, slong alloc)

    Initialises \code{poly}. It will have coefficients modulo~$n$.
    The caller supplies a precomputed inverse limb generated by
    \code{n_preinvert_limb()}. Up to \code{alloc} coefficients may
    be stored in \code{poly}.

void nmod_poly_realloc(nmod_poly_t poly, slong alloc)

    Reallocates \code{poly} to the given length. If the current
    length is greater than \code{alloc}, the polynomial is truncated
    and normalised.  If \code{alloc} is zero, the polynomial is
    cleared.

void nmod_poly_clear(nmod_poly_t poly)

    Clears the polynomial and releases any memory it used. The polynomial
    cannot be used again until it is initialised.

void nmod_poly_fit_length(nmod_poly_t poly, slong alloc)

    Ensures \code{poly} has space for at least \code{alloc} coefficients.
    This function only ever grows the allocated space, so no data loss can
    occur.

void _nmod_poly_normalise(nmod_poly_t poly)

    Internal function for normalising a polynomial so that the top
    coefficient, if there is one at all, is not zero.

*******************************************************************************

    Polynomial properties

*******************************************************************************

slong nmod_poly_length(const nmod_poly_t poly)

    Returns the length of the polynomial \code{poly}. The zero polynomial
    has length zero.

slong nmod_poly_degree(const nmod_poly_t poly)

    Returns the degree of the polynomial \code{poly}. The zero polynomial
    is deemed to have degree~$-1$.

mp_limb_t nmod_poly_modulus(const nmod_poly_t poly)

    Returns the modulus of the polynomial \code{poly}. This will be a
    positive integer.

mp_bitcnt_t nmod_poly_max_bits(const nmod_poly_t poly)

    Returns the maximum number of bits of any coefficient of \code{poly}.

*******************************************************************************

    Assignment and basic manipulation

*******************************************************************************

void nmod_poly_set(nmod_poly_t a, const nmod_poly_t b)

    Sets \code{a} to a copy of \code{b}.

void nmod_poly_swap(nmod_poly_t poly1, nmod_poly_t poly2)

    Efficiently swaps \code{poly1} and \code{poly2} by swapping pointers
    internally.

void nmod_poly_zero(nmod_poly_t res)

    Sets \code{res} to the zero polynomial.

void nmod_poly_truncate(nmod_poly_t poly, slong len)

    Truncates \code{poly} to the given length and normalises it.
    If \code{len} is greater than the current length of \code{poly},
    then nothing happens.

void _nmod_poly_reverse(mp_ptr output, mp_srcptr input, slong len, slong m)

    Sets \code{output} to the reverse of \code{input}, which is of length
    \code{len}, but thinking of it as a polynomial of length~\code{m},
    notionally zero-padded if necessary. The length~\code{m} must be
    non-negative, but there are no other restrictions. The polynomial
    \code{output} must have space for \code{m} coefficients.

void nmod_poly_reverse(nmod_poly_t output, const nmod_poly_t input, slong m)

    Sets \code{output} to the reverse of \code{input}, thinking of it as
    a polynomial of length~\code{m}, notionally zero-padded if necessary.
    The length~\code{m} must be non-negative, but there are no other
    restrictions. The output polynomial will be set to length~\code{m}
    and then normalised.

*******************************************************************************

    Randomization

*******************************************************************************

void nmod_poly_randtest(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random polynomial with length up to \code{len}.

void
nmod_poly_randtest_irreducible(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random irreducible polynomial with length up to \code{len}.

void nmod_poly_randtest_monic(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random monic polynomial with length \code{len}.

void
nmod_poly_randtest_monic_irreducible(nmod_poly_t poly, flint_rand_t state,
                                     slong len)

    Generates a random monic irreducible polynomial with length \code{len}.


void
nmod_poly_randtest_trinomial(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random monic trinomial of length \code{len}.

int
nmod_poly_randtest_trinomial_irreducible(nmod_poly_t poly, flint_rand_t state,
                                         slong len, slong max_attempts)

    Attempts to set \code{poly} to a monic irreducible trinomial of
    length \code{len}.  It will generate up to \code{max_attempts}
    trinomials in an attempt to find an irreducible one.  If
    \code{max_attempts} is \code{0}, then it will keep generating
    trinomials until an irreducible one is found.  Returns $1$ if one
    is found and $0$ otherwise.

void
nmod_poly_randtest_pentomial(nmod_poly_t poly, flint_rand_t state, slong len)

    Generates a random monic pentomial of length \code{len}.

int
nmod_poly_randtest_pentomial_irreducible(nmod_poly_t poly, flint_rand_t state,
                                         slong len, slong max_attempts)

    Attempts to set \code{poly} to a monic irreducible pentomial of
    length \code{len}.  It will generate up to \code{max_attempts}
    pentomials in an attempt to find an irreducible one.  If
    \code{max_attempts} is \code{0}, then it will keep generating
    pentomials until an irreducible one is found.  Returns $1$ if one
    is found and $0$ otherwise.

void
nmod_poly_randtest_sparse_irreducible(nmod_poly_t poly, flint_rand_t state,
                                      slong len)

    Attempts to set \code{poly} to a sparse, monic irreducible polynomial
    with length \code{len}.  It attempts to find an irreducible
    trinomial.  If that does not succeed, it attempts to find an
    irreducible pentomial.  If that fails, then \code{poly} is just
    set to a random monic irreducible polynomial.

*******************************************************************************

    Getting and setting coefficients

*******************************************************************************

ulong nmod_poly_get_coeff_ui(const nmod_poly_t poly, slong j)

    Returns the coefficient of \code{poly} at index~\code{j}, where
    coefficients are numbered with zero being the constant coefficient,
    and returns it as an \code{ulong}. If \code{j} refers to a
    coefficient beyond the end of \code{poly}, zero is returned.

void nmod_poly_set_coeff_ui(nmod_poly_t poly, slong j, ulong c)

    Sets the coefficient of \code{poly} at index \code{j}, where
    coefficients are numbered with zero being the constant coefficient,
    to the value \code{c} reduced modulo the modulus of \code{poly}.
    If \code{j} refers to a coefficient beyond the current end of \code{poly},
    the polynomial is first resized, with intervening coefficients being
    set to zero.

*******************************************************************************

    Input and output

*******************************************************************************

char * nmod_poly_get_str(const nmod_poly_t poly)

    Writes \code{poly} to a string representation. The format is as
    described for \code{nmod_poly_print()}. The string must be freed by the
    user when finished. For this it is sufficient to call \code{flint_free()}.

char * nmod_poly_get_str_pretty(const nmod_poly_t poly, const char * x)

    Writes \code{poly} to a pretty string representation. The format is as
    described for \code{nmod_poly_print_pretty()}. The string must be freed
    by the user when finished. For this it is sufficient to call
    \code{flint_free()}.

    It is assumed that the top coefficient is non-zero.

int nmod_poly_set_str(nmod_poly_t poly, const char * s)

    Reads \code{poly} from a string \code{s}. The format is as described
    for \code{nmod_poly_print()}. If a polynomial in the correct format
    is read, a positive value is returned, otherwise a non-positive value
    is returned.

int nmod_poly_print(const nmod_poly_t a)

    Prints the polynomial to \code{stdout}. The length is printed,
    followed by a space, then the modulus. If the length is zero this is
    all that is printed, otherwise two spaces followed by a
    space-separated list of coefficients is printed, beginning with the
    constant coefficient.

    In case of success, returns a positive value.  In case of failure,
    returns a non-positive value.

int nmod_poly_print_pretty(const nmod_poly_t a, const char * x)

    Prints the polynomial to \code{stdout} using the string \code{x} to
    represent the indeterminate.

    It is assumed that the top coefficient is non-zero.

    In case of success, returns a positive value.  In case of failure,
    returns a non-positive value.

int nmod_poly_fread(FILE * f, nmod_poly_t poly)

    Reads \code{poly} from the file stream \code{f}. If this is a file
    that has just been written, the file should be closed then opened
    again. The format is as described for \code{nmod_poly_print()}. If a
    polynomial in the correct format is read, a positive value is returned,
    otherwise a non-positive value is returned.

int nmod_poly_fprint(FILE * f, const nmod_poly_t poly)

    Writes a polynomial to the file stream \code{f}. If this is a file
    then the file should be closed and reopened before being read.
    The format is as described for \code{nmod_poly_print()}. If the
    polynomial is written correctly, a positive value is returned,
    otherwise a non-positive value is returned.

    In case of success, returns a positive value.  In case of failure,
    returns a non-positive value.

int nmod_poly_fprint_pretty(FILE * f, const nmod_poly_t poly, const char * x)

    Writes a polynomial to the file stream \code{f}. If this is a file
    then the file should be closed and reopened before being read.
    The format is as described for \code{nmod_poly_print_pretty()}. If the
    polynomial is written correctly, a positive value is returned,
    otherwise a non-positive value is returned.

    It is assumed that the top coefficient is non-zero.

    In case of success, returns a positive value.  In case of failure,
    returns a non-positive value.

int nmod_poly_read(nmod_poly_t poly)

    Reads \code{poly} from \code{stdin}. The format is as described for
    \code{nmod_poly_print()}. If a polynomial in the correct format is read, a
    positive value is returned, otherwise a non-positive value is returned.

*******************************************************************************

    Comparison

*******************************************************************************

int nmod_poly_equal(const nmod_poly_t a, const nmod_poly_t b)

    Returns~$1$ if the polynomials are equal, otherwise~$0$.

int nmod_poly_is_zero(const nmod_poly_t poly)

    Returns~$1$ if the polynomial \code{poly} is the zero polynomial,
    otherwise returns~$0$.

int nmod_poly_is_one(const nmod_poly_t poly)

    Returns~$1$ if the polynomial \code{poly} is the constant polynomial 1,
    otherwise returns~$0$.

*******************************************************************************

    Shifting

*******************************************************************************

void _nmod_poly_shift_left(mp_ptr res, mp_srcptr poly, slong len, slong k)

    Sets \code{(res, len + k)} to \code{(poly, len)} shifted left by
    \code{k} coefficients. Assumes that \code{res} has space for
    \code{len + k} coefficients.

void nmod_poly_shift_left(nmod_poly_t res, const nmod_poly_t poly, slong k)

    Sets \code{res} to \code{poly} shifted left by \code{k} coefficients,
    i.e.\ multiplied by $x^k$.

void _nmod_poly_shift_right(mp_ptr res, mp_srcptr poly, slong len, slong k)

    Sets \code{(res, len - k)} to \code{(poly, len)} shifted right by
    \code{k} coefficients. It is assumed that \code{k <= len} and that
    \code{res} has space for at least \code{len - k} coefficients.

void nmod_poly_shift_right(nmod_poly_t res, const nmod_poly_t poly, slong k)

    Sets \code{res} to \code{poly} shifted right by \code{k} coefficients,
    i.e.\ divided by $x^k$, discarding the remainder. If \code{k} is
    greater than or equal to the length of \code{poly}, the result is the
    zero polynomial.

*******************************************************************************

    Addition and subtraction

*******************************************************************************

void _nmod_poly_add(mp_ptr res, mp_srcptr poly1, slong len1,
                         mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{res} to the sum of \code{(poly1, len1)} and
    \code{(poly2, len2)}. There are no restrictions on the lengths.

void nmod_poly_add(nmod_poly_t res, const nmod_poly_t poly1,
                                            const nmod_poly_t poly2)

    Sets \code{res} to the sum of \code{poly1} and \code{poly2}.

void _nmod_poly_sub(mp_ptr res, mp_srcptr poly1, slong len1,
                         mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{res} to the difference of \code{(poly1, len1)} and
    \code{(poly2, len2)}. There are no restrictions on the lengths.

void nmod_poly_sub(nmod_poly_t res, const nmod_poly_t poly1,
                                    const nmod_poly_t poly2)

    Sets \code{res} to the difference of \code{poly1} and \code{poly2}.

void nmod_poly_neg(nmod_poly_t res, const nmod_poly_t poly)

    Sets \code{res} to the negation of \code{poly}.

*******************************************************************************

    Scalar multiplication and division

*******************************************************************************

void nmod_poly_scalar_mul_nmod(nmod_poly_t res,
                          const nmod_poly_t poly, ulong c)

    Sets \code{res} to \code{poly} multiplied by~$c$,
    where~$c$ is reduced modulo the modulus of \code{poly}.

void _nmod_poly_make_monic(mp_ptr output,
                                      mp_srcptr input, slong len, nmod_t mod)

    Sets \code{output} to be the scalar multiple of \code{input} of
    length \code{len > 0} that has leading coefficient one, if such a
    polynomial exists. If the leading coefficient of \code{input} is not
    invertible, \code{output} is set to the multiple of \code{input} whose
    leading coefficient is the greatest common divisor of the leading
    coefficient and the modulus of \code{input}.

void nmod_poly_make_monic(nmod_poly_t output, const nmod_poly_t input)

    Sets \code{output} to be the scalar multiple of \code{input} with leading
    coefficient one, if such a polynomial exists. If \code{input} is zero
    an exception is raised. If the leading coefficient of \code{input} is not
    invertible, \code{output} is set to the multiple of \code{input} whose
    leading coefficient is the greatest common divisor of the leading
    coefficient and the modulus of \code{input}.

*******************************************************************************

    Bit packing and unpacking

*******************************************************************************

void _nmod_poly_bit_pack(mp_ptr res, mp_srcptr poly, slong len,
                                                     mp_bitcnt_t bits)

    Packs \code{len} coefficients of \code{poly} into fields of the given
    number of bits in the large integer \code{res}, i.e.\ evaluates
    \code{poly} at \code{2^bits} and stores the result in \code{res}.
    Assumes \code{len > 0} and \code{bits > 0}. Also assumes that no
    coefficient of \code{poly} is bigger than \code{bits/2} bits. We
    also assume \code{bits < 3 * FLINT_BITS}.

void _nmod_poly_bit_unpack(mp_ptr res, slong len,
                                mp_srcptr mpn, ulong bits, nmod_t mod)

    Unpacks \code{len} coefficients stored in the big integer \code{mpn}
    in bit fields of the given number of bits, reduces them modulo the
    given modulus, then stores them in the polynomial \code{res}.
    We assume \code{len > 0} and \code{3 * FLINT_BITS > bits > 0}.
    There are no restrictions on the size of the actual coefficients as
    stored within the bitfields.

void nmod_poly_bit_pack(fmpz_t f, const nmod_poly_t poly, mp_bitcnt_t bit_size)

    Packs \code{poly} into bitfields of size \code{bit_size}, writing the
    result to \code{f}.

void nmod_poly_bit_unpack(nmod_poly_t poly, const fmpz_t f,
        mp_bitcnt_t bit_size)

    Unpacks the polynomial from fields of size \code{bit_size} as
    represented by the integer \code{f}.


void _nmod_poly_KS2_pack1(mp_ptr res, mp_srcptr op, slong n, slong s,
                ulong b, ulong k, slong r)

    Same as \code{_nmod_poly_KS2_pack}, but requires \code{b <= FLINT_BITS}.

void _nmod_poly_KS2_pack(mp_ptr res, mp_srcptr op, slong n, slong s,
               ulong b, ulong k, slong r)

    Bit packing routine used by KS2 and KS4 multiplication.

void _nmod_poly_KS2_unpack1(mp_ptr res, mp_srcptr op, slong n, ulong b,
                  ulong k)

    Same as \code{_nmod_poly_KS2_unpack}, but requires \code{b <= FLINT_BITS}
    (i.e. writes one word per coefficient).

void _nmod_poly_KS2_unpack2(mp_ptr res, mp_srcptr op, slong n, ulong b,
                  ulong k)

    Same as \code{_nmod_poly_KS2_unpack}, but requires
    \code{FLINT_BITS < b <= 2 * FLINT_BITS} (i.e. writes two words per
    coefficient).

void _nmod_poly_KS2_unpack3(mp_ptr res, mp_srcptr op, slong n, ulong b,
                  ulong k)

    Same as \code{_nmod_poly_KS2_unpack}, but requires
    \code{2 * FLINT_BITS < b < 3 * FLINT_BITS} (i.e. writes three words per
    coefficient).

void _nmod_poly_KS2_unpack(mp_ptr res, mp_srcptr op, slong n, ulong b,
                 ulong k)

    Bit unpacking code used by KS2 and KS4 multiplication.


*******************************************************************************

    KS2/KS4 Reduction

*******************************************************************************

void _nmod_poly_KS2_reduce(mp_ptr res, slong s, mp_srcptr op, slong n, ulong w,
                     nmod_t mod)

    Reduction code used by KS2 and KS4 multiplication.

void _nmod_poly_KS2_recover_reduce1(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{0 < 2 * b <= FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce2(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{FLINT_BITS < 2 * b < 2*FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce2b(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{b == FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce3(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Same as \code{_nmod_poly_KS2_recover_reduce}, but requires
    \code{2 * FLINT_BITS < 2 * b <= 3 * FLINT_BITS}.

void _nmod_poly_KS2_recover_reduce(mp_ptr res, slong s, mp_srcptr op1,
                          mp_srcptr op2, slong n, ulong b, nmod_t mod)

    Reduction code used by KS4 multiplication.


*******************************************************************************

    Multiplication

*******************************************************************************

void _nmod_poly_mul_classical(mp_ptr res, mp_srcptr poly1,
                    slong len1, mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{(res, len1 + len2 - 1)} to the product of \code{(poly1, len1)}
    and \code{(poly2, len2)}. Assumes \code{len1 >= len2 > 0}. Aliasing of
    inputs and output is not permitted.

void nmod_poly_mul_classical(nmod_poly_t res,
                             const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow_classical(mp_ptr res, mp_srcptr poly1, slong len1,
                          mp_srcptr poly2, slong len2, slong trunc, nmod_t mod)

    Sets \code{res} to the lower \code{trunc} coefficients of the product of
    \code{(poly1, len1)} and \code{(poly2, len2)}. Assumes that
    \code{len1 >= len2 > 0} and \code{trunc > 0}. Aliasing of inputs and
    output is not permitted.

void nmod_poly_mullow_classical(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2, slong trunc)

    Sets \code{res} to the lower \code{trunc} coefficients of the product
    of \code{poly1} and \code{poly2}.

void _nmod_poly_mulhigh_classical(mp_ptr res, mp_srcptr poly1,
            slong len1, mp_srcptr poly2, slong len2, slong start, nmod_t mod)

    Computes the product of \code{(poly1, len1)} and \code{(poly2, len2)}
    and writes the coefficients from \code{start} onwards into the high
    coefficients of \code{res}, the remaining coefficients being arbitrary
    but reduced.  Assumes that \code{len1 >= len2 > 0}. Aliasing of inputs
    and output is not permitted.

void nmod_poly_mulhigh_classical(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2, slong start)

    Computes the product of \code{poly1} and \code{poly2} and writes the
    coefficients from \code{start} onwards into the high coefficients of
    \code{res}, the remaining coefficients being arbitrary but reduced.

void _nmod_poly_mul_KS(mp_ptr out, mp_srcptr in1, slong len1,
                     mp_srcptr in2, slong len2, mp_bitcnt_t bits, nmod_t mod)

    Sets \code{out} to the product of \code{in1} and \code{in2}
    assuming the output coefficients are at most the given number of
    bits wide. If \code{bits} is set to $0$ an appropriate value is
    computed automatically.  Assumes that \code{len1 >= len2 > 0}.

void nmod_poly_mul_KS(nmod_poly_t res,
            const nmod_poly_t poly1, const nmod_poly_t poly2, mp_bitcnt_t bits)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}
    assuming the output coefficients are at most the given number of
    bits wide. If \code{bits} is set to $0$ an appropriate value
    is computed automatically.

void _nmod_poly_mul_KS2(mp_ptr res, mp_srcptr op1, slong n1,
                  mp_srcptr op2, slong n2, nmod_t mod)

    Sets \code{res} to the product of \code{op1} and \code{op2}.
    Assumes that \code{n1 >= n2 > 0}.

void nmod_poly_mul_KS2(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mul_KS4(mp_ptr res, mp_srcptr op1, slong n1,
                  mp_srcptr op2, slong n2, nmod_t mod)

    Sets \code{res} to the product of \code{op1} and \code{op2}.
    Assumes that \code{n1 >= n2 > 0}.

void nmod_poly_mul_KS4(nmod_poly_t res,
                 const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow_KS(mp_ptr out, mp_srcptr in1, slong len1,
              mp_srcptr in2, slong len2, mp_bitcnt_t bits, slong n, nmod_t mod)

    Sets \code{out} to the low $n$ coefficients of \code{in1} of length
    \code{len1} times \code{in2} of length \code{len2}. The output must have
    space for \code{n} coefficients. We assume that \code{len1 >= len2 > 0}
    and that \code{0 < n <= len1 + len2 - 1}.

void nmod_poly_mullow_KS(nmod_poly_t res, const nmod_poly_t poly1,
                            const nmod_poly_t poly2, mp_bitcnt_t bits, slong n)

    Sets \code{res} to the low $n$ coefficients of the product of
    \code{poly1} and \code{poly2}.

void _nmod_poly_mul(mp_ptr res, mp_srcptr poly1, slong len1,
                                       mp_srcptr poly2, slong len2, nmod_t mod)

    Sets \code{res} to the product of \code{poly1} of length \code{len1}
    and \code{poly2} of length \code{len2}. Assumes \code{len1 >= len2 > 0}.
    No aliasing is permitted between the inputs and the output.

void nmod_poly_mul(nmod_poly_t res,
                               const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow(mp_ptr res, mp_srcptr poly1, slong len1,
                              mp_srcptr poly2, slong len2, slong n, nmod_t mod)

    Sets \code{res} to the first \code{n} coefficients of the
    product of \code{poly1} of length \code{len1} and \code{poly2} of
    length \code{len2}. It is assumed that \code{0 < n <= len1 + len2 - 1}
    and that \code{len1 >= len2 > 0}. No aliasing of inputs and output
    is permitted.

void nmod_poly_mullow(nmod_poly_t res, const nmod_poly_t poly1,
                                          const nmod_poly_t poly2, slong trunc)

    Sets \code{res} to the first \code{trunc} coefficients of the
    product of \code{poly1} and \code{poly2}.

void _nmod_poly_mulhigh(mp_ptr res, mp_srcptr poly1, slong len1,
                              mp_srcptr poly2, slong len2, slong n, nmod_t mod)

    Sets all but the low $n$ coefficients of \code{res} to the
    corresponding coefficients of the product of \code{poly1} of length
    \code{len1} and \code{poly2} of length \code{len2}, the other
    coefficients being arbitrary. It is assumed that
    \code{len1 >= len2 > 0} and that \code{0 < n <= len1 + len2 - 1}.
    Aliasing of inputs and output is not permitted.

void nmod_poly_mulhigh(nmod_poly_t res, const nmod_poly_t poly1,
                                          const nmod_poly_t poly2, slong n)

    Sets all but the low $n$ coefficients of \code{res} to the
    corresponding coefficients of the product of \code{poly1} and
    \code{poly2}, the remaining coefficients being arbitrary.

void _nmod_poly_mulmod(mp_ptr res, mp_srcptr poly1, slong len1,
                             mp_srcptr poly2, slong len2, mp_srcptr f,
                            slong lenf, nmod_t mod)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

    It is required that \code{len1 + len2 - lenf > 0}, which is equivalent
    to requiring that the result will actually be reduced. Otherwise, simply
    use \code{_nmod_poly_mul} instead.

    Aliasing of \code{f} and \code{res} is not permitted.

void nmod_poly_mulmod(nmod_poly_t res,
    const nmod_poly_t poly1, const nmod_poly_t poly2, const nmod_poly_t f)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

void _nmod_poly_mulmod_preinv(mp_ptr res, mp_srcptr poly1, slong len1,
                          mp_srcptr poly2, slong len2, mp_srcptr f,
                         slong lenf, mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

    It is required that \code{finv} is the inverse of the reverse of \code{f}
    mod \code{x^lenf}. It is required that \code{len1 + len2 - lenf > 0},
    which is equivalent to requiring that the result will actually be reduced;
    otherwise, simply use \code{_nmod_poly_mul} instead. It is also required
    that \code{len1 < lenf} and \code{len2 < lenf}.

    Aliasing of \code{f} or \code{finv} and \code{res} is not permitted.

void nmod_poly_mulmod_preinv(nmod_poly_t res,
    const nmod_poly_t poly1, const nmod_poly_t poly2, const nmod_poly_t f,
    const nmod_poly_t finv)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}. \code{finv} is the
    inverse of the reverse of \code{f}. It is required that \code{poly1} and
    \code{poly2} are reduced modulo \code{f}.

*******************************************************************************

    Powering

*******************************************************************************

void _nmod_poly_pow_binexp(mp_ptr res,
                             mp_srcptr poly, slong len, ulong e, nmod_t mod)

    Raises \code{poly} of length \code{len} to the power \code{e} and sets
    \code{res} to the result. We require that \code{res} has enough space
    for \code{(len - 1)*e + 1} coefficients. Assumes that \code{len > 0},
    \code{e > 1}. Aliasing is not permitted. Uses the binary exponentiation
    method.

void nmod_poly_pow_binexp(nmod_poly_t res, const nmod_poly_t poly, ulong e)

    Raises \code{poly} to the power \code{e} and sets \code{res} to the
    result. Uses the binary exponentiation method.

void _nmod_poly_pow(mp_ptr res,
                             mp_srcptr poly, slong len, ulong e, nmod_t mod)

    Raises \code{poly} of length \code{len} to the power \code{e} and sets
    \code{res} to the result. We require that \code{res} has enough space
    for \code{(len - 1)*e + 1} coefficients. Assumes that \code{len > 0},
    \code{e > 1}. Aliasing is not permitted.

void nmod_poly_pow(nmod_poly_t res, const nmod_poly_t poly, ulong e)

    Raises \code{poly} to the power \code{e} and sets \code{res} to the
    result.

void _nmod_poly_pow_trunc_binexp(mp_ptr res, mp_srcptr poly,
                                           ulong e, slong trunc, nmod_t mod)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    (assumed to be zero padded if necessary to length \code{trunc}) to
    the power \code{e}. This is equivalent to doing a powering followed
    by a truncation. We require that \code{res} has enough space for
    \code{trunc} coefficients, that \code{trunc > 0} and that
    \code{e > 1}. Aliasing is not permitted. Uses the binary
    exponentiation method.

void nmod_poly_pow_trunc_binexp(nmod_poly_t res,
                               const nmod_poly_t poly, ulong e, slong trunc)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    to the power \code{e}. This is equivalent to doing a powering
    followed by a truncation. Uses the binary exponentiation method.

void _nmod_poly_pow_trunc(mp_ptr res, mp_srcptr poly,
                                           ulong e, slong trunc, nmod_t mod)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    (assumed to be zero padded if necessary to length \code{trunc}) to
    the power \code{e}. This is equivalent to doing a powering followed
    by a truncation. We require that \code{res} has enough space for
    \code{trunc} coefficients, that \code{trunc > 0} and that
    \code{e > 1}. Aliasing is not permitted.

void nmod_poly_pow_trunc(nmod_poly_t res,
                               const nmod_poly_t poly, ulong e, slong trunc)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    to the power \code{e}. This is equivalent to doing a powering
    followed by a truncation.

void _nmod_poly_powmod_ui_binexp(mp_ptr res, mp_srcptr poly,
                                ulong e, mp_srcptr f,
                                slong lenf, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void nmod_poly_powmod_ui_binexp(nmod_poly_t res,
                           const nmod_poly_t poly, ulong e,
                           const nmod_poly_t f)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.

void
_nmod_poly_powmod_ui_binexp_preinv (mp_ptr res, mp_srcptr poly,
                                    ulong e, mp_srcptr f, slong lenf,
                                    mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void
nmod_poly_powmod_ui_binexp_preinv(nmod_poly_t res,
                           const nmod_poly_t poly, ulong e,
                           const nmod_poly_t f, const nmod_poly_t finv)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

void
_nmod_poly_powmod_x_ui_preinv (mp_ptr res, ulong e, mp_srcptr f, slong lenf,
                               mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to \code{x} raised to the power \code{e} modulo \code{f},
    using sliding window exponentiation. We require \code{e > 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

    We require \code{lenf > 2}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void
nmod_poly_powmod_x_ui_preinv(nmod_poly_t res, ulong e, const nmod_poly_t f,
                             const nmod_poly_t finv)

    Sets \code{res} to \code{x} raised to the power \code{e}
    modulo \code{f}, using sliding window exponentiation. We require
    \code{e >= 0}. We require \code{finv} to be the inverse of the reverse of
    \code{f}.

void _nmod_poly_powmod_mpz_binexp(mp_ptr res, mp_srcptr poly,
                                mpz_srcptr e, mp_srcptr f,
                                slong lenf, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void nmod_poly_powmod_mpz_binexp(nmod_poly_t res,
                           const nmod_poly_t poly, mpz_srcptr e,
                           const nmod_poly_t f)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.

void
_nmod_poly_powmod_mpz_binexp_preinv (mp_ptr res, mp_srcptr poly,
                                    mpz_srcptr e, mp_srcptr f, slong lenf,
                                    mp_srcptr finv, slong lenfinv, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void
nmod_poly_powmod_mpz_binexp_preinv(nmod_poly_t res,
                           const nmod_poly_t poly, mpz_srcptr e,
                           const nmod_poly_t f, const nmod_poly_t finv)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.
    We require \code{finv} to be the inverse of the reverse of \code{f}.

*******************************************************************************

    Division

*******************************************************************************

void _nmod_poly_divrem_basecase(mp_ptr Q, mp_ptr R, mp_ptr W,
           mp_srcptr A, slong A_len, mp_srcptr B, slong B_len, nmod_t mod)

    Finds $Q$ and $R$ such that $A = B Q + R$ with $\len(R) < \len(B)$.
    If $\len(B) = 0$ an exception is raised. We require that \code{W}
    is temporary space of \code{NMOD_DIVREM_BC_ITCH(A_len, B_len, mod)}
    coefficients.

void nmod_poly_divrem_basecase(nmod_poly_t Q,
                       nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B)

    Finds $Q$ and $R$ such that $A = B Q + R$ with $\len(R) < \len(B)$.
    If $\len(B) = 0$ an exception is raised.

void _nmod_poly_div_basecase(mp_ptr Q, mp_ptr W, mp_srcptr A, slong A_len,
                                         mp_srcptr B, slong B_len, nmod_t mod)

    Notionally finds polynomials $Q$ and $R$ such that $A = B Q + R$ with
    $\len(R) < \len(B)$, but returns only \code{Q}. If $\len(B) = 0$ an
    exception is raised. We require that \code{W} is temporary space of
    \code{NMOD_DIV_BC_ITCH(A_len, B_len, mod)} coefficients.

void nmod_poly_div_basecase(nmod_poly_t Q, const nmod_poly_t A,
                                                          const nmod_poly_t B)

    Notionally finds polynomials $Q$ and $R$ such that $A = B Q + R$ with
    $\len(R) < \len(B)$, but returns only \code{Q}. If $\len(B) = 0$ an
    exception is raised.

void _nmod_poly_divrem_divconquer_recursive(mp_ptr Q, mp_ptr BQ, mp_ptr W,
                    mp_ptr V, mp_srcptr A, mp_srcptr B, slong lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where \code{A} is of length \code{2 * lenB - 1} and \code{B}
    is of length \code{lenB}. Sets \code{BQ} to the low \code{lenB - 1}
    coefficients of \code{B * Q}. We require that \code{Q} have space for
    \code{lenB} coefficients, that \code{W} be temporary space of size
    \code{lenB - 1} and \code{V} be temporary space for a number of
    coefficients computed by \code{NMOD_DIVREM_DC_ITCH(lenB, mod)}.

void _nmod_poly_divrem_divconquer(mp_ptr Q, mp_ptr R,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where \code{A} is of length \code{lenA} and \code{B} is of
    length \code{lenB}. We require that \code{Q} have space for
    \code{lenA - lenB + 1} coefficients.

void nmod_poly_divrem_divconquer(nmod_poly_t Q, nmod_poly_t R,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.

void _nmod_poly_divrem_q0(mp_ptr Q, mp_ptr R,
                          mp_srcptr A, mp_srcptr B, slong lenA, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$,
    where $\len(A) = \len(B) > 0$.

    Requires that $Q$ and $R$ have space for $1$ and $\len(B) - 1$
    coefficients, respectively.

    Does not support aliasing or zero-padding.

void _nmod_poly_divrem_q1(mp_ptr Q, mp_ptr R,
                          mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                          nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$,
    where $\len(A) = \len(B) + 1 \geq \len(B) > 0$.

    Requires that $Q$ and $R$ have space for $\len(A) - \len(B) + 1$ and
    $\len(B) - 1$ coefficients, respectively.

    Does not support aliasing or zero-padding.

void _nmod_poly_divrem(mp_ptr Q, mp_ptr R,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where \code{A} is of length \code{lenA} and \code{B} is of
    length \code{lenB}. We require that \code{Q} have space for
    \code{lenA - lenB + 1} coefficients.

void nmod_poly_divrem(nmod_poly_t Q, nmod_poly_t R,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.

void _nmod_poly_div_divconquer_recursive(mp_ptr Q, mp_ptr W, mp_ptr V,
                              mp_srcptr A, mp_srcptr B, slong lenB, nmod_t mod)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$
    less than \code{lenB}, where \code{A} is of length \code{2 * lenB - 1}
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We
    require that \code{Q} have space for \code{lenB} coefficients and that
    \code{W} be temporary space of size \code{lenB - 1} and \code{V} be
    temporary space for a number of coefficients computed by
    \code{NMOD_DIV_DC_ITCH(lenB, mod)}.

void _nmod_poly_div_divconquer(mp_ptr Q, mp_srcptr A, slong lenA,
                                           mp_srcptr B, slong lenB, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA}
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We
    require that \code{Q} have space for \code{lenA - lenB + 1} coefficients.

void nmod_poly_div_divconquer(nmod_poly_t Q,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, but returns only $Q$.

void _nmod_poly_div(mp_ptr Q, mp_srcptr A, slong lenA,
                                           mp_srcptr B, slong lenB, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA}
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We
    require that \code{Q} have space for \code{lenA - lenB + 1} coefficients.


void nmod_poly_div(nmod_poly_t Q, const nmod_poly_t A, const nmod_poly_t B)

    Computes the quotient $Q$ on polynomial division of $A$ by $B$.

void _nmod_poly_rem_basecase(mp_ptr R, mp_ptr W, mp_srcptr A, slong lenA,
                                       mp_srcptr B, slong lenB, nmod_t mod)

void nmod_poly_rem_basecase(nmod_poly_t R,
                            const nmod_poly_t A, const nmod_poly_t B)

void _nmod_poly_rem_q1(mp_ptr R,
                       mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                       nmod_t mod)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, where $\len(A) = \len(B) + 1 \geq \len(B) > 0$,
    but returns only the remainder.

    Requires that $R$ has space for $\len(B) - 1$ coefficients.

    Does not support aliasing or zero-padding.

void _nmod_poly_rem(mp_ptr R, mp_srcptr A, slong lenA,
                              mp_srcptr B, slong lenB, nmod_t mod)

    Computes the remainder $R$ on polynomial division of $A$ by $B$.

void nmod_poly_rem(nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B)

    Computes the remainder $R$ on polynomial division of $A$ by $B$.

void _nmod_poly_inv_series_basecase(mp_ptr Qinv,
                                    mp_srcptr Q, slong n, nmod_t mod)

    Given \code{Q} of length \code{n} whose leading coefficient is invertible
    modulo the given modulus, finds a polynomial \code{Qinv} of length \code{n}
    such that the top \code{n} coefficients of the product \code{Q * Qinv} is
    $x^{n - 1}$. Requires that \code{n > 0}. This function can be viewed as
    inverting a power series.

void nmod_poly_inv_series_basecase(nmod_poly_t Qinv,
                                   const nmod_poly_t Q, slong n)

    Given \code{Q} of length at least \code{n} find \code{Qinv} of length
    \code{n} such that the top \code{n} coefficients of the product
    \code{Q * Qinv} is $x^{n - 1}$. An exception is raised if \code{n = 0}
    or if the length of \code{Q} is less than \code{n}. The leading
    coefficient of \code{Q} must be invertible modulo the modulus of
    \code{Q}. This function can be viewed as inverting a power series.

void
_nmod_poly_inv_series_newton(mp_ptr Qinv, mp_srcptr Q, slong n, nmod_t mod)

    Given \code{Q} of length \code{n} whose constant coefficient is invertible
    modulo the given modulus, find a polynomial \code{Qinv} of length \code{n}
    such that \code{Q * Qinv} is \code{1} modulo $x^n$. Requires \code{n > 0}.
    This function can be viewed as inverting a power series via Newton
    iteration.

void
nmod_poly_inv_series_newton(nmod_poly_t Qinv, const nmod_poly_t Q, slong n)

    Given \code{Q} find \code{Qinv} such that \code{Q * Qinv} is \code{1}
    modulo $x^n$. The constant coefficient of \code{Q} must be invertible
    modulo the modulus of \code{Q}. An exception is raised if this is not
    the case or if \code{n = 0}. This function can be viewed as inverting
    a power series via Newton iteration.

void _nmod_poly_inv_series(mp_ptr Qinv, mp_srcptr Q, slong n, nmod_t mod)

    Given \code{Q} of length \code{n} whose constant coefficient is invertible
    modulo the given modulus, find a polynomial \code{Qinv} of length \code{n}
    such that \code{Q * Qinv} is \code{1} modulo $x^n$. Requires \code{n > 0}.
    This function can be viewed as inverting a power series.

void nmod_poly_inv_series(nmod_poly_t Qinv, const nmod_poly_t Q, slong n)

    Given \code{Q} find \code{Qinv} such that \code{Q * Qinv} is \code{1}
    modulo $x^n$. The constant coefficient of \code{Q} must be invertible
    modulo the modulus of \code{Q}. An exception is raised if this is not
    the case or if \code{n = 0}. This function can be viewed as inverting
    a power series.

void _nmod_poly_div_series(mp_ptr Q, mp_srcptr A, mp_srcptr B,
                                                  slong n, nmod_t mod)

    Given polynomials \code{A} and \code{B} of length \code{n}, finds the
    polynomial \code{Q} of length \code{n} such that \code{Q * B = A}
    modulo $x^n$. We assume \code{n > 0} and that the constant coefficient
    of \code{B} is invertible modulo the given modulus. The polynomial
    \code{Q} must have space for \code{n} coefficients.

void nmod_poly_div_series(nmod_poly_t Q, const nmod_poly_t A,
                                         const nmod_poly_t B, slong n)

    Given polynomials \code{A} and \code{B} considered modulo $x^n$,
    finds the polynomial \code{Q} of length at most \code{n} such that
    \code{Q * B = A} modulo $x^n$. We assume \code{n > 0} and that the
    constant coefficient of \code{B} is invertible modulo the modulus.
    An exception is raised if \code{n == 0} or the constant coefficient
    of \code{B} is zero.

void _nmod_poly_div_newton(mp_ptr Q, mp_srcptr A, slong Alen,
                                     mp_srcptr B, slong Blen, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{Blen}, where \code{A} is of length \code{Alen}
    and \code{B} is of length \code{Blen}, but returns only $Q$.

    We require that $Q$ have space for \code{Alen - Blen + 1} coefficients
    and assume that the leading coefficient of $B$ is a unit.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void nmod_poly_div_newton(nmod_poly_t Q, const nmod_poly_t A,
                                         const nmod_poly_t B)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, but returns only $Q$.

    We assume that the leading coefficient of $B$ is a unit.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void _nmod_poly_div_newton_n_preinv (mp_ptr Q, mp_srcptr A, slong lenA,
            mp_srcptr B, slong lenB, mp_srcptr Binv, slong lenBinv, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA}
    and \code{B} is of length \code{lenB}, but returns only $Q$.

    We require that $Q$ have space for \code{lenA - lenB + 1} coefficients
    and assume that the leading coefficient of $B$ is a unit. Furthermore, we
    assume that $Binv$ is the inverse of the reverse of $B$ mod $x^{\len(B)}$.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void nmod_poly_div_newton_n_preinv (nmod_poly_t Q, const nmod_poly_t A,
                                 const nmod_poly_t B, const nmod_poly_t Binv)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R) < \len(B)$, but returns only $Q$.

    We assume that the leading coefficient of $B$ is a unit and that $Binv$ is
    the inverse of the reverse of $B$ mod $x^{\len(B)}$.

    It is required that the length of $A$ is less than or equal to
    $2 \len(B) - 2$.

    The algorithm used is to reverse the polynomials and divide the
    resulting power series, then reverse the result.

void _nmod_poly_divrem_newton(mp_ptr Q, mp_ptr R, mp_srcptr A, slong Alen,
                                        mp_srcptr B, slong Blen, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{Blen}, where $A$ is of length \code{Alen} and $B$ is of length
    \code{Blen}. We require that $Q$ have space for \code{Alen - Blen + 1}
    coefficients. The algorithm used is to call \code{div_newton()} and then
    multiply out and compute the remainder.

void nmod_poly_divrem_newton(nmod_poly_t Q, nmod_poly_t R,
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.
    The algorithm used is to call \code{div_newton()} and then multiply out
    and compute the remainder.

void _nmod_poly_divrem_newton_n_preinv (mp_ptr Q, mp_ptr R, mp_srcptr A,
slong lenA, mp_srcptr B, slong lenB, mp_srcptr Binv, slong lenBinv, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than
    \code{lenB}, where $A$ is of length \code{lenA} and $B$ is of length
    \code{lenB}. We require that $Q$ have space for \code{lenA - lenB + 1}
    coefficients. Furthermore, we assume that $Binv$ is the inverse of the
    reverse of $B$ mod $x^{\len(B)}$. The algorithm used is to call
    \code{div_newton_n_preinv()} and then multiply out and compute
    the remainder.

void nmod_poly_divrem_newton_n_preinv(nmod_poly_t Q, nmod_poly_t R,
            const nmod_poly_t A, const nmod_poly_t B, const nmod_poly_t Binv)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.
    We assume $Binv$ is the inverse of the reverse of $B$ mod $x^{\len(B)}$.

    It is required that the length of $A$ is less than or equal to
    $2 \len(B) - 2$.

    The algorithm used is to call \code{div_newton_n_preinv()} and then
    multiply out and compute the remainder.

mp_limb_t _nmod_poly_div_root(mp_ptr Q, mp_srcptr A, slong len,
                                mp_limb_t c, nmod_t mod)

    Sets \code{(Q, len-1)} to the quotient of \code{(A, len)} on division
    by $(x - c)$, and returns the remainder, equal to the value of $A$
    evaluated at $c$. $A$ and $Q$ are allowed to be the same, but may
    not overlap partially in any other way.

mp_limb_t nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, mp_limb_t c)

    Sets $Q$ to the quotient of $A$ on division by $(x - c)$, and returns
    the remainder, equal to the value of $A$ evaluated at $c$.

*******************************************************************************

    Derivative and integral

*******************************************************************************

void _nmod_poly_derivative(mp_ptr x_prime, mp_srcptr x, slong len, nmod_t mod)

    Sets the first \code{len - 1} coefficients of \code{x_prime} to the
    derivative of \code{x} which is assumed to be of length \code{len}.
    It is assumed that \code{len > 0}.

void nmod_poly_derivative(nmod_poly_t x_prime, const nmod_poly_t x)

    Sets \code{x_prime} to the derivative of \code{x}.

void _nmod_poly_integral(mp_ptr x_int, mp_srcptr x, slong len, nmod_t mod)

    Set the first \code{len} coefficients of \code{x_int} to the
    integral of \code{x} which is assumed to be of length \code{len - 1}.
    The constant term of \code{x_int} is set to zero.
    It is assumed that \code{len > 0}. The result is only well-defined
    if the modulus is a prime number strictly larger than the degree of
    \code{x}.

void nmod_poly_integral(nmod_poly_t x_int, const nmod_poly_t x)

    Set \code{x_int} to the indefinite integral of \code{x} with constant
    term zero. The result is only well-defined if the modulus
    is a prime number strictly larger than the degree of \code{x}.


*******************************************************************************

    Evaluation

*******************************************************************************

mp_limb_t _nmod_poly_evaluate_nmod(mp_srcptr poly, slong len, mp_limb_t c,
                                   nmod_t mod)

    Evaluates \code{poly} at the value~\code{c} and reduces modulo the
    given modulus of \code{poly}. The value~\code{c} should be reduced
    modulo the modulus. The algorithm used is Horner's method.

mp_limb_t nmod_poly_evaluate_nmod(nmod_poly_t poly, mp_limb_t c)

    Evaluates \code{poly} at the value~\code{c} and reduces modulo the
    modulus of \code{poly}. The value~\code{c} should be reduced modulo
    the modulus. The algorithm used is Horner's method.

void nmod_poly_evaluate_mat_horner(nmod_mat_t dest, const nmod_poly_t poly,
						const nmod_mat_t c)

    Evaluates \code{poly} at the matrix \code{c} and stores the result in
    \code{dest}. The dimensions and modulus of \code{dest} are assumed to
    be the same as those of \code{c}. \code{dest} and \code{c} may be
    aliased. Horner's method is used to compute the result.

void nmod_poly_evaluate_mat_paterson_stockmeyer(nmod_mat_t dest,
                        const nmod_poly_t poly, const nmod_mat_t c)

    Evaluates \code{poly} at the matrix \code{c} and stores the result in
    \code{dest}. The dimensions and modulus of \code{dest} are assumed to
    be the same as those of \code{c}. \code{dest} and \code{c} may be
    aliased. The Paterson-Stockmeyer algorithm is used to compute the
    result. The algorithm is described in \cite{Paterson1973}.

void nmod_poly_evaluate_mat(nmod_mat_t dest, const nmod_poly_t poly,
                        const nmod_mat_t c)

    Evaluates \code{poly} at the matrix \code{c} and stores the result in
    \code{dest}. The dimensions and modulus of \code{dest} are assumed to
    be the same as those of \code{c}. \code{dest} and \code{c} may be
    aliased. This function automatically switches between Horner's method
    and the Paterson-Stockmeyer algorithm.

*******************************************************************************

    Multipoint evaluation

*******************************************************************************

void _nmod_poly_evaluate_nmod_vec_iter(mp_ptr ys, mp_srcptr poly, slong len,
                                    mp_srcptr xs, slong n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

    Uses Horner's method iteratively.

void nmod_poly_evaluate_nmod_vec_iter(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, slong n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

    Uses Horner's method iteratively.

void _nmod_poly_evaluate_nmod_vec_fast_precomp(mp_ptr vs, mp_srcptr poly,
    slong plen, const mp_ptr * tree, slong len, nmod_t mod)

    Evaluates (\code{poly}, \code{plen}) at the \code{len} values given
    by the precomputed subproduct tree \code{tree}.

void _nmod_poly_evaluate_nmod_vec_fast(mp_ptr ys, mp_srcptr poly,
        slong len, mp_srcptr xs, slong n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

    Uses fast multipoint evaluation, building a temporary subproduct tree.

void nmod_poly_evaluate_nmod_vec_fast(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, slong n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

    Uses fast multipoint evaluation, building a temporary subproduct tree.


void _nmod_poly_evaluate_nmod_vec(mp_ptr ys, mp_srcptr poly, slong len,
                                    mp_srcptr xs, slong n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

void nmod_poly_evaluate_nmod_vec(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, slong n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

*******************************************************************************

    Interpolation

*******************************************************************************

void _nmod_poly_interpolate_nmod_vec(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Sets \code{poly} to the unique polynomial of length at most \code{n}
    that interpolates the \code{n} given evaluation points \code{xs} and
    values \code{ys}. If the interpolating polynomial is shorter than
    length \code{n}, the leading coefficients are set to zero.

    The values in \code{xs} and \code{ys} should be reduced modulo the
    modulus, and all \code{xs} must be distinct. Aliasing between
    \code{poly} and \code{xs} or \code{ys} is not allowed.

void nmod_poly_interpolate_nmod_vec(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Sets \code{poly} to the unique polynomial of length at most \code{n}
    that interpolates the \code{n} given evaluation points \code{xs} and
    values \code{ys}. The values in \code{xs} and \code{ys} should be
    reduced modulo the modulus, and all \code{xs} must be distinct.

void _nmod_poly_interpolation_weights(mp_ptr w, const mp_ptr * tree,
        slong len, nmod_t mod)

    Sets \code{w} to the barycentric interpolation weights for fast
    Lagrange interpolation with respect to a given subproduct tree.

void _nmod_poly_interpolate_nmod_vec_fast_precomp(mp_ptr poly, mp_srcptr ys,
    const mp_ptr * tree, mp_srcptr weights, slong len, nmod_t mod)

    Performs interpolation using the fast Lagrange interpolation
    algorithm with a precomputed subproduct tree.

    The function values are given as \code{ys}. The function takes
    a precomputed subproduct tree \code{tree} and barycentric
    interpolation weights \code{weights} corresponding to the
    roots.

void _nmod_poly_interpolate_nmod_vec_fast(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Performs interpolation using the fast Lagrange interpolation
    algorithm, generating a temporary subproduct tree.

void nmod_poly_interpolate_nmod_vec_fast(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Performs interpolation using the fast Lagrange interpolation algorithm,
    generating a temporary subproduct tree.

void _nmod_poly_interpolate_nmod_vec_newton(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Forms the interpolating polynomial in the Newton basis using
    the method of divided differences and then converts it to
    monomial form.

void nmod_poly_interpolate_nmod_vec_newton(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Forms the interpolating polynomial in the Newton basis using
    the method of divided differences and then converts it to
    monomial form.

void _nmod_poly_interpolate_nmod_vec_barycentric(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, slong n, nmod_t mod)

    Forms the interpolating polynomial using a naive implementation
    of the barycentric form of Lagrange interpolation.

void nmod_poly_interpolate_nmod_vec_barycentric(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, slong n)

    Forms the interpolating polynomial using a naive implementation
    of the barycentric form of Lagrange interpolation.


*******************************************************************************

    Composition

*******************************************************************************

void _nmod_poly_compose_horner(mp_ptr res, mp_srcptr poly1, slong len1,
                               mp_srcptr poly2, slong len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates
    \code{poly1} at \code{poly2}. The algorithm used is Horner's algorithm.
    We require that \code{res} have space for \code{(len1 - 1)*(len2 - 1) + 1}
    coefficients. It is assumed that \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose_horner(nmod_poly_t res,
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}. The algorithm used is
    Horner's algorithm.

void _nmod_poly_compose_divconquer(mp_ptr res, mp_srcptr poly1, slong len1,
                                   mp_srcptr poly2, slong len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates
    \code{poly1} at \code{poly2}. The algorithm used is the divide and
    conquer algorithm. We require that \code{res} have space for
    \code{(len1 - 1)*(len2 - 1) + 1} coefficients. It is assumed that
    \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose_divconquer(nmod_poly_t res,
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}. The algorithm used is
    the divide and conquer algorithm.

void _nmod_poly_compose(mp_ptr res, mp_srcptr poly1, slong len1,
                                       mp_srcptr poly2, slong len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates \code{poly1}
    at \code{poly2}. We require that \code{res} have space for
    \code{(len1 - 1)*(len2 - 1) + 1} coefficients. It is assumed that
    \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose(nmod_poly_t res,
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    that is, evaluates \code{poly1} at \code{poly2}.

*******************************************************************************

    Taylor shift

*******************************************************************************

void _nmod_poly_taylor_shift_horner(mp_ptr poly, mp_limb_t c,
    slong len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    Uses an efficient version of Horner's rule.

void nmod_poly_taylor_shift_horner(nmod_poly_t g,
    const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.

void _nmod_poly_taylor_shift_convolution(mp_ptr poly, mp_limb_t c,
    slong len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    Writes the composition as a single convolution with cost $O(M(n))$.
    We require that the modulus is a prime at least as large as the length.

void nmod_poly_taylor_shift_convolution(nmod_poly_t g,
    const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.
    Writes the composition as a single convolution with cost $O(M(n))$.
    We require that the modulus is a prime at least as large as the length.

void _nmod_poly_taylor_shift(mp_ptr poly, mp_limb_t c, slong len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    We require that the modulus is a prime.

void nmod_poly_taylor_shift(nmod_poly_t g, const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.
    We require that the modulus is a prime.

*******************************************************************************

    Modular composition

*******************************************************************************

void _nmod_poly_compose_mod_horner(mp_ptr res,
    mp_srcptr f, slong lenf, mp_srcptr g, mp_srcptr h, slong lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). The output is not allowed
    to be aliased with any of the inputs.

    The algorithm used is Horner's rule.

void nmod_poly_compose_mod_horner(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero. The algorithm used is Horner's rule.


void _nmod_poly_compose_mod_brent_kung(mp_ptr res,
    mp_srcptr f, slong lenf, mp_srcptr g, mp_srcptr h, slong lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). We also require that
    the length of $f$ is less than the length of $h$. The output is not allowed
    to be aliased with any of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void nmod_poly_compose_mod_brent_kung(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that $f$ has smaller degree than $h$.
    The algorithm used is the Brent-Kung matrix algorithm.

void _nmod_poly_compose_mod_brent_kung_preinv(mp_ptr res, mp_srcptr f,
                            slong lenf,
                            mp_srcptr g, mp_srcptr h, slong lenh,
                            mp_srcptr hinv, slong lenhinv, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). We also require that
    the length of $f$ is less than the length of $h$. Furthermore, we require
    \code{hinv} to be the inverse of the reverse of \code{h}.
    The output is not allowed to be aliased with any of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void nmod_poly_compose_mod_brent_kung_preinv(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h, const nmod_poly_t hinv)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that $f$ has smaller degree than $h$. Furthermore,
    we require \code{hinv} to be the inverse of the reverse of \code{h}.
    The algorithm used is the Brent-Kung matrix algorithm.

void
_nmod_poly_reduce_matrix_mod_poly (nmod_mat_t A, const nmod_mat_t B,
                          const nmod_poly_t f)

    Sets the ith row of \code{A} to the reduction of the ith row of $B$ modulo
    $f$ for $i=1,\ldots,\sqrt{\deg(f)}$. We require $B$ to be at least
    a $\sqrt{\deg(f)}\times \deg(f)$ matrix and $f$ to be nonzero.

void *
_nmod_poly_precompute_matrix_worker (void * arg_ptr)

    Worker function version of \code{_nmod_poly_precompute_matrix}.
    Input/output is stored in \code{nmod_poly_matrix_precompute_arg_t}.

void
_nmod_poly_precompute_matrix (nmod_mat_t A, mp_srcptr f, mp_srcptr g,
               slong leng, mp_srcptr ginv, slong lenginv, nmod_t mod)

    Sets the ith row of \code{A} to $f^i$ modulo $g$ for
    $i=1,\ldots,\sqrt{\deg(g)}$. We require $A$ to be
    a $\sqrt{\deg(g)}\times \deg(g)$ matrix. We require
    \code{ginv} to be the inverse of the reverse of \code{g} and $g$ to be
    nonzero. \code{f} has to be reduced modulo \code{g} and of length one less
    than \code{leng} (possibly with zero padding).

void
nmod_poly_precompute_matrix (nmod_mat_t A, const nmod_poly_t f,
                          const nmod_poly_t g, const nmod_poly_t ginv)

    Sets the ith row of \code{A} to $f^i$ modulo $g$ for
    $i=1,\ldots,\sqrt{\deg(g)}$. We require $A$ to be
    a $\sqrt{\deg(g)}\times \deg(g)$ matrix. We require
    \code{ginv} to be the inverse of the reverse of \code{g}.

void *
_nmod_poly_compose_mod_brent_kung_precomp_preinv_worker(void * arg_ptr)

    Worker function version of
    \code{_nmod_poly_compose_mod_brent_kung_precomp_preinv}.
    Input/output is stored in
    \code{nmod_poly_compose_mod_precomp_preinv_arg_t}.

void
_nmod_poly_compose_mod_brent_kung_precomp_preinv(mp_ptr res, mp_srcptr f,
                            slong lenf, const nmod_mat_t A, mp_srcptr h,
                            slong lenh, mp_srcptr hinv, slong lenhinv,
                            nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero. We require that the ith row of $A$ contains $g^i$ for
    $i=1,\ldots,\sqrt{\deg(h)}$, i.e. $A$ is a
    $\sqrt{\deg(h)}\times \deg(h)$ matrix. We also require that
    the length of $f$ is less than the length of $h$. Furthermore, we require
    \code{hinv} to be the inverse of the reverse of \code{h}.
    The output is not allowed to be aliased with any of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void
nmod_poly_compose_mod_brent_kung_precomp_preinv(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_mat_t A,
                    const nmod_poly_t h, const nmod_poly_t hinv)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that the
    ith row of $A$ contains $g^i$ for $i=1,\ldots,\sqrt{\deg(h)}$, i.e. $A$ is a
    $\sqrt{\deg(h)}\times \deg(h)$ matrix. We require that $h$ is nonzero and
    that $f$ has smaller degree than $h$. Furthermore, we require \code{hinv} to
    be the inverse of the reverse of \code{h}. This version of Brent-Kung
    modular composition is particularly useful if one has to perform several
    modular compositions of the form $f(g)$ modulo $h$ for fixed $g$ and $h$.

void
_nmod_poly_compose_mod_brent_kung_vec_preinv (nmod_poly_struct * res,
                 const nmod_poly_struct * polys, slong len1, slong l,
                 mp_srcptr h, slong lenh, mp_srcptr hinv,
                 slong lenhinv, nmod_t mod)

    Sets \code{res} to the composition $f_i(g)$ modulo $h$ for $1\leq i \leq l$,
    where $f_i$ are the first \code{l} elements of \code{polys} and $g$ is the
    last element of \code{polys}. We require that $h$ is nonzero and that the
    length of $g$ is less than the length of $h$. We also require that the
    length of $f_i$ is less than the length of $h$. We require \code{res} to
    have enough memory allocated to hold \code{l} \code{nmod_poly_struct}.
    The entries of \code{res} need to be initialised and \code{l} needs to be
    less than \code{len1}. Furthermore, we require \code{hinv} to be the inverse
    of the reverse of \code{h}. The output is not allowed to be aliased with any
    of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void
nmod_poly_compose_mod_brent_kung_vec_preinv(nmod_poly_struct * res,
                    const nmod_poly_struct * polys, slong len1, slong n,
                    const nmod_poly_t h, const nmod_poly_t hinv)

    Sets \code{res} to the composition $f_i(g)$ modulo $h$ for $1\leq i \leq n$
    where $f_i$ are the first \code{n} elements of \code{polys} and $g$ is the
    last element of \code{polys}. We require \code{res} to have enough memory
    allocated to hold \code{n} \code{nmod_poly_struct}. The entries of
    \code{res} need to be uninitialised and \code{n} needs to be less than
    \code{len1}. We require that $h$ is nonzero and that $f_i$ and $g$ have
    smaller degree than $h$. Furthermore, we require \code{hinv} to be the
    inverse of the reverse of \code{h}. No aliasing of \code{res} and
    \code{polys} is allowed.
    The algorithm used is the Brent-Kung matrix algorithm.

void
_nmod_poly_compose_mod_brent_kung_vec_preinv_threaded(nmod_poly_struct * res,
                                             const nmod_poly_struct * polys,
                                             slong lenpolys, slong l,
                                             mp_srcptr poly, slong len,
                                             mp_srcptr polyinv, slong leninv,
                                             nmod_t mod)

    Multithreaded version of
    \code{_nmod_poly_compose_mod_brent_kung_vec_preinv}, distributing the
    Horner evaluations across \code{flint_get_num_threads()} threads.

void
nmod_poly_compose_mod_brent_kung_vec_preinv_threaded(nmod_poly_struct * res,
                                            const nmod_poly_struct * polys,
                                            slong len1, slong n,
                                            const nmod_poly_t poly,
                                            const nmod_poly_t polyinv)

    Multithreaded version of
    \code{nmod_poly_compose_mod_brent_kung_vec_preinv}, distributing the
    Horner evaluations across \code{flint_get_num_threads()} threads.

void _nmod_poly_compose_mod(mp_ptr res,
    mp_srcptr f, slong lenf, mp_srcptr g, mp_srcptr h, slong lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). The output is not allowed
    to be aliased with any of the inputs.

void nmod_poly_compose_mod(nmod_poly_t res,
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero.


*******************************************************************************

    Greatest common divisor

*******************************************************************************

slong _nmod_poly_gcd_euclidean(mp_ptr G,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes the GCD of $A$ of length \code{lenA} and $B$ of length
    \code{lenB}, where \code{lenA >= lenB > 0}. The length of the GCD $G$
    is returned by the function. No attempt is made to make the GCD monic. It
    is required that $G$ have space for \code{lenB} coefficients.

void nmod_poly_gcd_euclidean(nmod_poly_t G,
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

slong _nmod_poly_hgcd(mp_ptr *M, slong *lenM,
                     mp_ptr A, slong *lenA, mp_ptr B, slong *lenB,
                     mp_srcptr a, slong lena, mp_srcptr b, slong lenb,
                     nmod_t mod)

    Computes the HGCD of $a$ and $b$, that is, a matrix~$M$, a sign~$\sigma$
    and two polynomials $A$ and $B$ such that
    \begin{equation*}
    (A,B)^t = \sigma M^{-1} (a,b)^t.
    \end{equation*}

    Assumes that $\len(a) > \len(b) > 0$.

    Assumes that $A$ and $B$ have space of size at least $\len(a)$
    and $\len(b)$, respectively.  On exit, \code{*lenA} and \code{*lenB}
    will contain the correct lengths of $A$ and $B$.

    Assumes that \code{M[0]}, \code{M[1]}, \code{M[2]}, and \code{M[3]}
    each point to a vector of size at least $\len(a)$.

slong _nmod_poly_gcd_hgcd(mp_ptr G, mp_srcptr A, slong lenA,
                                   mp_srcptr B, slong lenB, nmod_t mod)

    Computes the monic GCD of $A$ and $B$, assuming that
    $\len(A) \geq \len(B) > 0$.

    Assumes that $G$ has space for $\len(B)$ coefficients and
    returns the length of $G$ on output.

void nmod_poly_gcd_hgcd(nmod_poly_t G,
                        const nmod_poly_t A, const nmod_poly_t B)

    Computes the monic GCD of $A$ and $B$ using the HGCD algorithm.

    As a special case, the GCD of two zero polynomials is defined to be
    the zero polynomial.

    The time complexity of the algorithm is $\mathcal{O}(n \log^2 n)$.
    For further details, see~\citep{ThullYap1990}.

slong _nmod_poly_gcd(mp_ptr G,
                  mp_srcptr A, slong lenA, mp_srcptr B, slong lenB, nmod_t mod)

    Computes the GCD of $A$ of length \code{lenA} and $B$ of length
    \code{lenB}, where \code{lenA >= lenB > 0}. The length of the GCD $G$
    is returned by the function. No attempt is made to make the GCD monic. It
    is required that $G$ have space for \code{lenB} coefficients.

void nmod_poly_gcd(nmod_poly_t G,
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

slong _nmod_poly_xgcd_euclidean(mp_ptr G, mp_ptr S, mp_ptr T,
             mp_srcptr A, slong A_len, mp_srcptr B, slong B_len, nmod_t mod)

    Computes the GCD of $A$ and $B$ together with cofactors $S$ and $T$
    such that $S A + T B = G$.  Returns the length of $G$.

    Assumes that $\len(A) \geq \len(B) \geq 1$ and
    $(\len(A),\len(B)) \neq (1,1)$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes
    $\len(B)-1$ and $\len(A)-1$ coefficients to $S$ and $T$, respectively.
    Note that, in fact, $\len(S) \leq \max(\len(B) - \len(G), 1)$ and
    $\len(T) \leq \max(\len(A) - \len(G), 1)$.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd_euclidean(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                                    const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    Polynomials \code{S} and \code{T} are computed such that
    \code{S*A + T*B = G}. The length of \code{S} will be at most
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

slong _nmod_poly_xgcd_hgcd(mp_ptr G, mp_ptr S, mp_ptr T,
             mp_srcptr A, slong A_len, mp_srcptr B, slong B_len, nmod_t mod)

    Computes the GCD of $A$ and $B$, where $\len(A) \geq \len(B) > 0$,
    together with cofactors $S$ and $T$ such that $S A + T B = G$. Returns
    the length of $G$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes
    $\len(B) - 1$ and $\len(A) - 1$ coefficients to $S$ and $T$,
    respectively.  Note that, in fact, $\len(S) \leq \len(B) - \len(G)$
    and $\len(T) \leq \len(A) - \len(G)$.

    Both $S$ and $T$ must have space for at least $2$ coefficients.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd_hgcd(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                         const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    Polynomials \code{S} and \code{T} are computed such that
    \code{S*A + T*B = G}. The length of \code{S} will be at most
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

slong _nmod_poly_xgcd(mp_ptr G, mp_ptr S, mp_ptr T,
                     mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                     nmod_t mod)

    Computes the GCD of $A$ and $B$, where $\len(A) \geq \len(B) > 0$,
    together with cofactors $S$ and $T$ such that $S A + T B = G$. Returns
    the length of $G$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes
    $\len(B) - 1$ and $\len(A) - 1$ coefficients to $S$ and $T$,
    respectively.  Note that, in fact, $\len(S) \leq \len(B) - \len(G)$
    and $\len(T) \leq \len(A) - \len(G)$.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                                    const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    The polynomials \code{S} and \code{T} are set such that
    \code{S*A + T*B = G}. The length of \code{S} will be at most
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

mp_limb_t
_nmod_poly_resultant_euclidean(mp_srcptr poly1, slong len1,
                               mp_srcptr poly2, slong len2, nmod_t mod)

    Returns the resultant of \code{(poly1, len1)} and
    \code{(poly2, len2)} using the Euclidean algorithm.

    Assumes that \code{len1 >= len2 > 0}.

    Assumes that the modulus is prime.

mp_limb_t
nmod_poly_resultant_euclidean(const nmod_poly_t f, const nmod_poly_t g)

    Computes the resultant of $f$ and $g$ using the Euclidean algorithm.

    For two non-zero polynomials $f(x) = a_m x^m + \dotsb + a_0$ and
    $g(x) = b_n x^n + \dotsb + b_0$ of degrees $m$ and $n$, the resultant
    is defined to be
    \begin{equation*}
        a_m^n b_n^m \prod_{(x, y) : f(x) = g(y) = 0} (x - y).
    \end{equation*}
    For convenience, we define the resultant to be equal to zero if either
    of the two polynomials is zero.

mp_limb_t
_nmod_poly_resultant_hgcd(mp_srcptr poly1, slong len1,
                               mp_srcptr poly2, slong len2, nmod_t mod)

    Returns the resultant of \code{(poly1, len1)} and
    \code{(poly2, len2)} using the half-gcd algorithm.

    This algorithm computes the half-gcd as per \code{_nmod_poly_gcd_hgcd()}
    but additionally updates the resultant every time a division occurs. The
    half-gcd algorithm computes the GCD recursively. Given inputs $a$ and $b$
    it lets \code{m = len(a)/2} and (recursively) performs all quotients in
    the Euclidean algorithm which do not require the low $m$ coefficients of
    $a$ and $b$.

    This performs quotients in exactly the same order as the ordinary 
    Euclidean algorithm except that the low $m$ coefficients of the polynomials
    in the remainder sequence are not computed. A correction step after hgcd 
    has been called computes these low $m$ coefficients (by matrix 
    multiplication by a transformation matrix also computed by hgcd).

    This means that from the point of view of the resultant, all but the last
    quotient performed by a recursive call to hgcd is an ordinary quotient as
    per the usual Euclidean algorithm. However, the final quotient may give
    a remainder of less than $m + 1$ coefficients, which won't be corrected
    until the hgcd correction step is performed afterwards.

    To compute the adjustments to the resultant coming from this corrected
    quotient, we save the relevant information in an \code{nmod_poly_res_t}
    struct at the time the quotient is performed so that when the correction 
    step is performed later, the adjustments to the resultant can be computed 
    at that time also.

    The only time an adjustment to the resultant is not required after a
    call to hgcd is if hgcd does nothing (the remainder may already have had
    less than $m + 1$ coefficients when hgcd was called).

    Assumes that \code{len1 >= len2 > 0}.

    Assumes that the modulus is prime.

mp_limb_t
nmod_poly_resultant_hgcd(const nmod_poly_t f, const nmod_poly_t g)

    Computes the resultant of $f$ and $g$ using the half-gcd algorithm.

    For two non-zero polynomials $f(x) = a_m x^m + \dotsb + a_0$ and
    $g(x) = b_n x^n + \dotsb + b_0$ of degrees $m$ and $n$, the resultant
    is defined to be
    \begin{equation*}
        a_m^n b_n^m \prod_{(x, y) : f(x) = g(y) = 0} (x - y).
    \end{equation*}
    For convenience, we define the resultant to be equal to zero if either
    of the two polynomials is zero.

mp_limb_t
_nmod_poly_resultant(mp_srcptr poly1, slong len1,
                     mp_srcptr poly2, slong len2, nmod_t mod)

    Returns the resultant of \code{(poly1, len1)} and
    \code{(poly2, len2)}.

    Assumes that \code{len1 >= len2 > 0}.

    Assumes that the modulus is prime.

mp_limb_t
nmod_poly_resultant(const nmod_poly_t f, const nmod_poly_t g)

    Computes the resultant of $f$ and $g$.

    For two non-zero polynomials $f(x) = a_m x^m + \dotsb + a_0$ and
    $g(x) = b_n x^n + \dotsb + b_0$ of degrees $m$ and $n$, the resultant
    is defined to be
    \begin{equation*}
        a_m^n b_n^m \prod_{(x, y) : f(x) = g(y) = 0} (x - y).
    \end{equation*}
    For convenience, we define the resultant to be equal to zero if either
    of the two polynomials is zero.

slong _nmod_poly_gcdinv(mp_ptr G, mp_ptr S,
                        mp_srcptr A, slong lenA, mp_srcptr B, slong lenB,
                        const nmod_t mod)

    Computes \code{(G, lenA)}, \code{(S, lenB-1)} such that
    $G \cong S A \pmod{B}$, returning the actual length of $G$.

    Assumes that $0 < \len(A) < \len(B)$.

void nmod_poly_gcdinv(nmod_poly_t G, nmod_poly_t S,
                      const nmod_poly_t A, const nmod_poly_t B)

    Computes polynomials $G$ and $S$, both reduced modulo~$B$,
    such that $G \cong S A \pmod{B}$, where $B$ is assumed to
    have $\len(B) \geq 2$.

    In the case that $A = 0 \pmod{B}$, returns $G = S = 0$.

int _nmod_poly_invmod(mp_ptr A, mp_srcptr B, slong lenB,
                      mp_srcptr P, slong lenP, const nmod_t mod)

    Attempts to set \code{(A, lenP-1)} to the inverse of \code{(B, lenB)}
    modulo the polynomial \code{(P, lenP)}.  Returns $1$ if \code{(B, lenB)}
    is invertible and $0$ otherwise.

    Assumes that $0 < \len(B) < \len(P)$, and hence also $\len(P) \geq 2$,
    but supports zero-padding in \code{(B, lenB)}.

    Does not support aliasing.

    Assumes that $mod$ is a prime number.

int nmod_poly_invmod(nmod_poly_t A, const nmod_poly_t B, const nmod_poly_t P)

    Attempts to set $A$ to the inverse of $B$ modulo $P$ in the polynomial
    ring $(\mathbf{Z}/p\mathbf{Z})[X]$, where we assume that $p$ is a prime
    number.

    If $\len(P) < 2$, raises an exception.

    If the greatest common divisor of $B$ and $P$ is~$1$, returns~$1$ and
    sets $A$ to the inverse of $B$.  Otherwise, returns~$0$ and the value
    of $A$ on exit is undefined.


*******************************************************************************

    Discriminant

*******************************************************************************

mp_limb_t _nmod_poly_discriminant(mp_srcptr poly, slong len, nmod_t mod)

    Return the discriminant of \code{(poly, len)}. Assumes \code{len > 1}.

mp_limb_t nmod_poly_discriminant(const nmod_poly_t f)

    Return the discriminant of $f$.
    We normalise the discriminant so that
    $\operatorname{disc}(f) = (-1)^{n(n-1)/2} \operatorname{res}(f, f') /
    \operatorname{lc}(f)^{n - m - 2}$, where \code{n = len(f)} and
    \code{m = len(f')}. Thus $\operatorname{disc}(f) =
    \operatorname{lc}(f)^{2n - 2} \prod_{i < j} (r_i - r_j)^2$, where
    $\operatorname{lc}(f)$ is the leading coefficient of $f$ and $r_i$ are the
    roots of $f$.


*******************************************************************************

    Power series composition

*******************************************************************************

void _nmod_poly_compose_series_horner(mp_ptr res,
        mp_srcptr poly1, slong len1, mp_srcptr poly2, slong len2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation uses the Horner scheme.

void nmod_poly_compose_series_horner(nmod_poly_t res,
                    const nmod_poly_t poly1, const nmod_poly_t poly2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation uses the Horner scheme.

void _nmod_poly_compose_series_brent_kung(mp_ptr res, mp_srcptr poly1,
        slong len1, mp_srcptr poly2, slong len2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that\\ \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation uses Brent-Kung algorithm 2.1 \cite{BrentKung1978}.

void nmod_poly_compose_series_brent_kung(nmod_poly_t res,
                const nmod_poly_t poly1, const nmod_poly_t poly2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation uses Brent-Kung algorithm 2.1 \cite{BrentKung1978}.

void _nmod_poly_compose_series_divconquer(mp_ptr res,
           mp_srcptr poly1, slong len1,
	   mp_srcptr poly2, slong len2, slong N, nmod_t mod)

    Composes \code{poly1} of length $\ell_1$ with \code{poly2} of
    length $\ell_2$ modulo $x^N$ and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}.

    Writes $\min\{(\ell_1 - 1)(\ell_2 - 1) + 1, N\}$ coefficients
    to the vector \code{res}.

    The algorithm used is the divide and conquer algorithm.
    It is assumed that $0 < \ell_1$ and $0 < \ell_2 \leq N$.

    Does not support aliasing between the inputs and the output.

void nmod_poly_compose_series_divconquer(nmod_poly_t res,
    const nmod_poly_t poly1, const nmod_poly_t poly2, slong N)

    Composes \code{poly1} with \code{poly2} modulo $x^N$ and sets \code{res}
    to the result, i.e.\ evaluates \code{poly1} at \code{poly2}.

    The algorithm used is the divide and conquer algorithm.

void _nmod_poly_compose_series(mp_ptr res, mp_srcptr poly1, slong len1,
                                      mp_srcptr poly2, slong len2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that\\ \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation automatically switches between the Horner scheme
    and Brent-Kung algorithm 2.1 depending on the size of the inputs.

void nmod_poly_compose_series(nmod_poly_t res,
                    const nmod_poly_t poly1, const nmod_poly_t poly2, slong n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation automatically switches between the Horner scheme
    and Brent-Kung algorithm 2.1 depending on the size of the inputs.

*******************************************************************************

    Power series reversion

*******************************************************************************

void _nmod_poly_revert_series_lagrange(mp_ptr Qinv, mp_srcptr Q,
        slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses the Lagrange inversion formula.

void nmod_poly_revert_series_lagrange(nmod_poly_t Qinv,
            const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses the Lagrange inversion formula.

void _nmod_poly_revert_series_lagrange_fast(mp_ptr Qinv, mp_srcptr Q,
        slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses a reduced-complexity implementation
    of the Lagrange inversion formula.

void nmod_poly_revert_series_lagrange_fast(nmod_poly_t Qinv,
            const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses a reduced-complexity implementation
    of the Lagrange inversion formula.

void _nmod_poly_revert_series_newton(mp_ptr Qinv, mp_srcptr Q,
    slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses Newton iteration \cite{BrentKung1978}.

void nmod_poly_revert_series_newton(nmod_poly_t Qinv,
        const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses Newton iteration \cite{BrentKung1978}.

void _nmod_poly_revert_series(mp_ptr Qinv, mp_srcptr Q, slong n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation automatically chooses between the Lagrange
    inversion formula and Newton iteration based on the size of the
    input.

void nmod_poly_revert_series(nmod_poly_t Qinv, const nmod_poly_t Q, slong n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation automatically chooses between the Lagrange
    inversion formula and Newton iteration based on the size of the
    input.

*******************************************************************************

    Square roots

    The series expansions for $\sqrt{h}$ and $1/\sqrt{h}$ are defined
    by means of the generalised binomial theorem
    $$h^r = (1+y)^r =
        \sum_{k=0}^{\infty} {r \choose k} y^k.$$
    It is assumed that $h$ has constant term $1$ and that the coefficients
    $2^{-k}$ exist in the coefficient ring (i.e. $2$ must be invertible).


*******************************************************************************

void _nmod_poly_invsqrt_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set the first $n$ terms of $g$ to the series expansion of $1/\sqrt{h}$.
    It is assumed that $n > 0$, that $h$ has constant term 1 and that $h$
    is zero-padded as necessary to length $n$. Aliasing is not permitted.

void nmod_poly_invsqrt_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g$ to the series expansion of $1/\sqrt{h}$ to order $O(x^n)$.
    It is assumed that $h$ has constant term 1.

void _nmod_poly_sqrt_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set the first $n$ terms of $g$ to the series expansion of $\sqrt{h}$.
    It is assumed that $n > 0$, that $h$ has constant term 1 and that $h$
    is zero-padded as necessary to length $n$. Aliasing is not permitted.

void nmod_poly_sqrt_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g$ to the series expansion of $\sqrt{h}$ to order $O(x^n)$.
    It is assumed that $h$ has constant term 1.

int _nmod_poly_sqrt(mp_ptr s, mp_srcptr p, slong n, nmod_t mod)

    If \code{(p, n)} is a perfect square, sets \code{(s, n / 2 + 1)}
    to a square root of $p$ and returns 1. Otherwise returns 0.

int nmod_poly_sqrt(nmod_poly_t s, const nmod_poly_t p)

    If $p$ is a perfect square, sets $s$ to a square root of $p$
    and returns 1. Otherwise returns 0.

*******************************************************************************

    Transcendental functions

    The elementary transcendental functions of a formal power series $h$
    are defined as

    $$\exp(h(x)) = \sum_{k=0}^{\infty} \frac{(h(x))^k}{k!}$$

    $$\log(h(x)) = \int_0^x \frac{h'(t)}{h(t)} dt$$

    $$\operatorname{atan}(h(x)) = \int_0^x\frac{h'(t)}{1+(h(t))^2} dt$$

    $$\operatorname{atanh}(h(x)) = \int_0^x\frac{h'(t)}{1-(h(t))^2} dt$$

    $$\operatorname{asin}(h(x)) = \int_0^x\frac{h'(t)}{\sqrt{1-(h(t))^2}} dt$$

    $$\operatorname{asinh}(h(x)) = \int_0^x\frac{h'(t)}{\sqrt{1+(h(t))^2}} dt$$

    The functions sin, cos, tan, etc. are defined using standard inverse
    or functional relations.

    The logarithm function assumes that $h$ has constant term $1$. All
    other functions assume that $h$ has constant term $0$.

    All functions assume that the coefficient $1/k$ or $1/k!$ exists
    for all indices $k$. When computing to order $O(x^n)$, the modulus $p$
    must therefore be a prime satisfying $p \ge n$. Further, we always
    require that $p > 2$ in order to be able to multiply by $1/2$ for
    internal purposes.

    If the input does not satisfy all these conditions, results are undefined.

    Except where otherwise noted, functions are implemented with optimal
    (up to constants) complexity $O(M(n))$, where $M(n)$ is the cost
    of polynomial multiplication.

*******************************************************************************

void _nmod_poly_log_series_monomial_ui(mp_ptr g,
            mp_limb_t c, ulong r, slong n, nmod_t mod)

    Set $g = \log(1+cx^r) + O(x^n)$. Assumes $n > 0$, $r > 0$, and that
    the coefficient is reduced by the modulus. Works efficiently in linear
    time.

void nmod_poly_log_series_monomial_ui(nmod_poly_t g,
            mp_limb_t c, ulong r, slong n)

    Set $g = \log(1+cx^r) + O(x^n)$. Works efficiently in linear time.

void _nmod_poly_log_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \log(h) + O(x^n)$. Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing of $g$ and $h$ is allowed.

void nmod_poly_log_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \log(h) + O(x^n)$. The case $h = 1+cx^r$ is automatically
    detected and handled efficiently.

void _nmod_poly_exp_series_monomial_ui(mp_ptr g,
            mp_limb_t c, ulong r, slong n, nmod_t mod)

    Set $g = \exp(cx^r) + O(x^n)$. Assumes $n > 0$, $r > 0$, and that
    the coefficient is reduced by the modulus. Works efficiently
    in linear time.

void nmod_poly_exp_series_monomial_ui(nmod_poly_t g,
            mp_limb_t c, ulong r, slong n)

    Set $g = \exp(cx^r) + O(x^n)$. Works efficiently in linear time.

void _nmod_poly_exp_series_basecase(mp_ptr g, mp_srcptr h, slong hlen,
                                        slong n, nmod_t mod)

    Set $g = \exp(h) + O(x^n)$ using a simple $O(n^2)$ algorithm.
    Assumes $n > 0$ and $\operatorname{hlen} > 0$. Only the first
    $\operatorname{hlen}$ coefficients of $h$ will be read.
    Aliasing of $g$ and $h$ is allowed.

void nmod_poly_exp_series_basecase(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \exp(h) + O(x^n)$ using a simple $O(n^2)$ algorithm.

void _nmod_poly_exp_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \exp(h) + O(x^n)$. Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing of $g$ and $h$ is not allowed.

    Uses Newton iteration (the version given in \cite{HanZim2004}).
    For small $n$, falls back to the basecase algorithm.

void  _nmod_poly_exp_expinv_series(mp_ptr f, mp_ptr g, mp_srcptr h,
        slong n, nmod_t mod)

    Set $f = \exp(h) + O(x^n)$ and $g = \exp(-h) + O(x^n)$, more efficiently
    for large $n$ than performing a separate inversion to obtain $g$.
    Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing is not allowed.

    Uses Newton iteration (the version given in \cite{HanZim2004}).
    For small $n$, falls back to the basecase algorithm.

void nmod_poly_exp_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \exp(h) + O(x^n)$. The case $h = cx^r$ is automatically
    detected and handled efficiently. Otherwise this function automatically
    uses the basecase algorithm for small $n$ and Newton iteration otherwise.

void _nmod_poly_atan_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{atan}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_atan_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{atan}(h) + O(x^n)$.

void _nmod_poly_atanh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{atanh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_atanh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{atanh}(h) + O(x^n)$.

void _nmod_poly_asin_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{asin}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_asin_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{asin}(h) + O(x^n)$.

void _nmod_poly_asinh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{asinh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_asinh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{asinh}(h) + O(x^n)$.

void _nmod_poly_sin_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{sin}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed. The value is computed using the identity
    $\sin(x) = 2 \tan(x/2) / (1 + \tan^2(x/2)).$

void nmod_poly_sin_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{sin}(h) + O(x^n)$.

void _nmod_poly_cos_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{cos}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed. The value is computed using the identity
    $\cos(x) = (1-\tan^2(x/2)) / (1 + \tan^2(x/2)).$

void nmod_poly_cos_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{cos}(h) + O(x^n)$.

void _nmod_poly_tan_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{tan}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses Newton iteration to invert the atan function.

void nmod_poly_tan_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{tan}(h) + O(x^n)$.

void _nmod_poly_sinh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{sinh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses the identity $\sinh(x) = (e^x - e^{-x})/2$.

void nmod_poly_sinh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{sinh}(h) + O(x^n)$.

void _nmod_poly_cosh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{cosh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses the identity $\cosh(x) = (e^x + e^{-x})/2$.

void nmod_poly_cosh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{cosh}(h) + O(x^n)$.

void _nmod_poly_tanh_series(mp_ptr g, mp_srcptr h, slong n, nmod_t mod)

    Set $g = \operatorname{tanh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Uses the identity
    $\tanh(x) = (e^{2x}-1)/(e^{2x}+1)$.

void nmod_poly_tanh_series(nmod_poly_t g, const nmod_poly_t h, slong n)

    Set $g = \operatorname{tanh}(h) + O(x^n)$.

*******************************************************************************

    Products

*******************************************************************************

void _nmod_poly_product_roots_nmod_vec(mp_ptr poly, mp_srcptr xs,
    slong n, nmod_t mod)

    Sets \code{(poly, n + 1)} to the monic polynomial which is the product
    of $(x - x_0)(x - x_1) \cdots (x - x_{n-1})$, the roots $x_i$ being
    given by \code{xs}.

    Aliasing of the input and output is not allowed.

void nmod_poly_product_roots_nmod_vec(nmod_poly_t poly, mp_srcptr xs, slong n)

    Sets \code{poly} to the monic polynomial which is the product
    of $(x - x_0)(x - x_1) \cdots (x - x_{n-1})$, the roots $x_i$ being
    given by \code{xs}.

*******************************************************************************

    Subproduct trees

*******************************************************************************

mp_ptr * _nmod_poly_tree_alloc(slong len)

    Allocates space for a subproduct tree of the given length, having
    linear factors at the lowest level.

    Entry $i$ in the tree is a pointer to a single array of limbs,
    capable of storing $\lfloor n / 2^i \rfloor$ subproducts of
    degree $2^i$ adjacently, plus a trailing entry if $n / 2^i$ is
    not an integer, where $n$ denotes \code{len}.

    For example, a tree of length 7 built from monic linear factors has
    the following structure, where spaces have been inserted
    for illustrative purposes:

    \begin{verbatim}
       X1 X1 X1 X1 X1 X1 X1
       XX1   XX1   XX1   X1
       XXXX1       XX1   X1
       XXXXXXX1
    \end{verbatim}

void _nmod_poly_tree_free(mp_ptr * tree, slong len)

    Frees the space allocated for the subproduct tree.

void _nmod_poly_tree_build(mp_ptr * tree, mp_srcptr roots, slong len,
    nmod_t mod)

    Builds a subproduct tree in the preallocated space from
    the \code{len} monic linear factors $(x-r_i)$. The top level
    product is not computed.


*******************************************************************************

    Inflation and deflation

*******************************************************************************

void nmod_poly_inflate(nmod_poly_t result, const nmod_poly_t input,
    ulong inflation)

    Sets \code{result} to the inflated polynomial $p(x^n)$ where
    $p$ is given by \code{input} and $n$ is given by \code{inflation}.

void nmod_poly_deflate(nmod_poly_t result, const nmod_poly_t input,
    ulong deflation)

    Sets \code{result} to the deflated polynomial $p(x^{1/n})$ where
    $p$ is given by \code{input} and $n$ is given by \code{deflation}.
    Requires $n > 0$.

ulong nmod_poly_deflation(const nmod_poly_t input)

    Returns the largest integer by which \code{input} can be deflated.
    As special cases, returns 0 if \code{input} is the zero polynomial
    and 1 if \code{input} is a constant polynomial.
