Source code for moments.Spectrum_mod

"""
Contains Spectrum object, which represents frequency spectra.
"""
import logging

logging.basicConfig()
logger = logging.getLogger("Spectrum_mod")
import functools
import operator
import os
import sys
import numpy as np
from numpy import newaxis as nuax
import scipy.misc as misc
import scipy.stats
import copy
import itertools
import warnings
import demes

# Account for difference in scipy installations.
try:
    from scipy.misc import comb
except ImportError:
    try:
        from scipy.special import comb
    except:
        from scipy import comb
from scipy.integrate import trapz
from scipy.special import betainc

import moments.Integration
import moments.Integration_nomig
from . import Numerics
import moments.Demes as Demes

_plotting = True
try:
    import moments.ModelPlot
except ImportError:
    # if matplotlib is not present, do not import, and do not run plotting functions
    _plotting = False


[docs]class Spectrum(np.ma.masked_array):
    """
    Represents a single-locus biallelic frequency spectrum.

    Spectra are represented by masked arrays. The masking allows us to ignore
    specific entries in the spectrum. When simulating under the standard infinite
    sites model (ISM), the entries we mask are the bins specifying absent or fixed
    variants. When using a reversible mutation model (i.e. the finite genome model),
    we track the density of variants in fixed bins, setting ``mask_corners`` to
    ``False``.

    :param data: An array with dimension equal to the number of populations.
        Each dimension has length :math:`n_i+1`, where :math:`n_i` is the
        sample size for the i-th population.
    :type data: array
    :param mask: An optional array of the same size as data. 'True' entries in
        this array are masked in the Spectrum. These represent missing
        data categories. (For example, you may not trust your singleton
        SNP calling.)
    :type maks: array, optional
    :param mask_corners: If True (default), the 'observed in none' and 'observed
        in all' entries of the FS will be masked. Typically these
        entries are masked. In the defaul infinite sites model, moments does
        not reliably calculate the fixed-bin entries, so you will almost always
        want ``mask_corners=True``. The exception is if we are simulating under
        the finite genome model, in which case we track the probability of
        a site to be fixed for either allele.
    :type maks_corners: bool, optional
    :param data_folded: If True, it is assumed that the input data is folded. An
        error will be raised if the input data and mask are not
        consistent with a folded Spectrum.
    :type data_folded: bool, optional
    :param check_folding: If True and data_folded=True, the data and mask will be
        checked to ensure they are consistent with a folded
        Spectrum. If they are not, a warning will be printed.
    :type check_folding: bool, optional
    :param pop_ids: Optional list of strings containing the population labels,
        with length equal to the dimension of ``data``.
    :type pop_ids: list of strings, optional

    :return: A frequency spectrum object, as a masked array.
    """

    def __new__(
        subtype,
        data,
        mask=np.ma.nomask,
        mask_corners=True,
        data_folded=None,
        check_folding=True,
        dtype=float,
        copy=True,
        fill_value=np.nan,
        keep_mask=True,
        shrink=True,
        pop_ids=None,
    ):
        data = np.asanyarray(data)

        if mask is np.ma.nomask:
            mask = np.ma.make_mask_none(data.shape)
        else:
            # Since a mask is given, we do not mask the corners
            mask_corners = False

        subarr = np.ma.masked_array(
            data,
            mask=mask,
            dtype=dtype,
            copy=copy,
            fill_value=np.nan,
            keep_mask=True,
            shrink=True,
        )
        subarr = subarr.view(subtype)

        if hasattr(data, "folded"):
            if data_folded is None or data_folded == data.folded:
                subarr.folded = data.folded
            elif data_folded != data.folded:
                raise ValueError(
                    "Data does not have same folding status as "
                    "was called for in Spectrum constructor."
                )
        elif data_folded is not None:
            subarr.folded = data_folded
        else:
            subarr.folded = False

        # Check that if we're declaring that the input data is folded, it
        # actually is, and the mask reflects this.
        if data_folded:
            total_samples = np.sum(subarr.sample_sizes)
            total_per_entry = subarr._total_per_entry()
            # Which entries are nonsense in the folded fs.
            where_folded_out = total_per_entry > int(total_samples / 2)
            if check_folding and not np.all(subarr.data[where_folded_out] == 0):
                logger.warn(
                    "Creating Spectrum with data_folded = True, but "
                    "data has non-zero values in entries which are "
                    "nonsensical for a folded Spectrum."
                )
            if check_folding and not np.all(subarr.mask[where_folded_out]):
                logger.warn(
                    "Creating Spectrum with data_folded = True, but "
                    "mask is not True for all entries which are "
                    "nonsensical for a folded Spectrum."
                )

        if hasattr(data, "pop_ids"):
            if pop_ids is None or pop_ids == data.pop_ids:
                subarr.pop_ids = data.pop_ids
            elif pop_ids != data.pop_ids:
                logger.warn(
                    "Changing population labels in construction of new " "Spectrum."
                )
                if len(pop_ids) != subarr.ndim:
                    raise ValueError(
                        "pop_ids must be of length equal to "
                        "dimensionality of Spectrum."
                    )
                subarr.pop_ids = pop_ids
        else:
            if pop_ids is not None and len(pop_ids) != subarr.ndim:
                raise ValueError(
                    "pop_ids must be of length equal to " "dimensionality of Spectrum."
                )
            subarr.pop_ids = pop_ids

        if mask_corners:
            subarr.mask_corners()

        return subarr

    # See http://www.scipy.org/Subclasses for information on the
    # __array_finalize__ and __array_wrap__ methods. I had to do some debugging
    # myself to discover that I also needed _update_from.
    # Also, see http://docs.scipy.org/doc/np/reference/arrays.classes.html
    # Also, see http://docs.scipy.org/doc/np/user/basics.subclassing.html
    #
    # We need these methods to ensure extra attributes get copied along when
    # we do arithmetic on the FS.
    def __array_finalize__(self, obj):
        if obj is None:
            return
        np.ma.masked_array.__array_finalize__(self, obj)
        self.folded = getattr(obj, "folded", "unspecified")
        self.pop_ids = getattr(obj, "pop_ids", None)

    def __array_wrap__(self, obj, context=None):
        result = obj.view(type(self))
        result = np.ma.masked_array.__array_wrap__(self, obj, context=context)
        result.folded = self.folded
        result.pop_ids = self.pop_ids
        return result

    def _update_from(self, obj):
        np.ma.masked_array._update_from(self, obj)
        if hasattr(obj, "folded"):
            self.folded = obj.folded
        if hasattr(obj, "pop_ids"):
            self.pop_ids = obj.pop_ids

    # masked_array has priority 20.
    __array_priority__ = 20

    def __repr__(self):
        return "Spectrum(%s, folded=%s, pop_ids=%s)" % (
            str(self),
            str(self.folded),
            str(self.pop_ids),
        )

    # Functions for manipulating frequency spectra.
[docs]    def mask_corners(self):
        """
        Mask the 'seen in 0 samples' and 'seen in all samples' entries.
        """
        self.mask.flat[0] = self.mask.flat[-1] = True

[docs]    def unmask_all(self):
        """
        Unmask all entires of the frequency spectrum.
        """
        self.mask[tuple([slice(None)] * self.Npop)] = False

    def _get_sample_sizes(self):
        return np.asarray(self.shape) - 1

    sample_sizes = property(_get_sample_sizes)

    def _get_Npop(self):
        return self.ndim

    Npop = property(_get_Npop)

    def _ensure_dimension(self, Npop):
        """
        Ensure that fs has Npop dimensions.
        """
        if not self.Npop == Npop:
            raise ValueError("Only compatible with %id spectra." % Npop)

[docs]    def project(self, ns):
        """
        Project to smaller sample size.

        ``project`` does *not* act in-place, so that the input frequency
        spectrum is not changed.

        :param ns: Sample sizes for new spectrum.
        :type ns: list of integers
        """
        if len(ns) != self.Npop:
            raise ValueError(
                "Requested sample sizes not of same dimension "
                "as spectrum. Perhaps you need to marginalize "
                "over some populations first?"
            )
        if np.any(np.asarray(ns) > np.asarray(self.sample_sizes)):
            raise ValueError(
                "Cannot project to a sample size greater than "
                "original. Original size is %s and requested size "
                "is %s." % (self.sample_sizes, ns)
            )

        original_folded = self.folded
        # If we started with an folded Spectrum, we need to unfold before
        # projecting.
        if original_folded:
            output = self.unfold()
        else:
            output = self.copy()

        # Iterate over each axis, applying the projection.
        for axis, proj in enumerate(ns):
            if proj != self.sample_sizes[axis]:
                output = output._project_one_axis(proj, axis)

        output.pop_ids = self.pop_ids

        # Return folded or unfolded as original.
        if original_folded:
            return output.fold()
        else:
            return output

    def _project_one_axis(self, n, axis=0):
        """
        Project along a single axis.
        """
        # This gets a little tricky with fancy indexing to make it work
        # for fs with arbitrary number of dimensions.
        if n > self.sample_sizes[axis]:
            raise ValueError(
                "Cannot project to a sample size greater than "
                "original. Called sizes were from %s to %s."
                % (self.sample_sizes[axis], n)
            )

        newshape = list(self.shape)
        newshape[axis] = n + 1
        # Create a new empty fs that we'll fill in below.
        pfs = Spectrum(np.zeros(newshape), mask_corners=False)

        # Set up for our fancy indexes. These slices are currently like
        # [:,:,...]
        from_slice = [slice(None) for ii in range(self.Npop)]
        to_slice = [slice(None) for ii in range(self.Npop)]
        proj_slice = [nuax for ii in range(self.Npop)]

        proj_from = self.sample_sizes[axis]
        # For each possible number of hits.
        for hits in range(proj_from + 1):
            # Adjust the slice in the array we're projecting from.
            from_slice[axis] = slice(hits, hits + 1)
            # These are the least and most possible hits we could have in the
            #  projected fs.
            least, most = max(n - (proj_from - hits), 0), min(hits, n)
            to_slice[axis] = slice(least, most + 1)
            # The projection weights.
            proj = Numerics._cached_projection(n, proj_from, hits)
            proj_slice[axis] = slice(least, most + 1)
            # Do the multiplications
            pfs.data[tuple(to_slice)] += (
                self.data[tuple(from_slice)] * proj[tuple(proj_slice)]
            )
            pfs.mask[tuple(to_slice)] = np.logical_or(
                pfs.mask[tuple(to_slice)], self.mask[tuple(from_slice)]
            )

        return pfs

[docs]    def marginalize(self, over, mask_corners=None):
        """
        Reduced dimensionality spectrum summing over the set of populations
        given by ``over``.

        ``marginalize`` does not act in-place, so the input frequency spectrum
        will not be altered.

        :param over: List of axes to sum over. For example (0,2) will marginalize
            populations 0 and 2.
        :type over: list of integers
        :param mask_corners: If True, the fixed bins of the resulting spectrum will be
            masked. The default behavior is to mask the corners only if at least one
            of the corners of the input frequency spectrum is masked. If either
            corner is masked, the output frequency spectrum masks the fixed bins.
        :type mask_corners: bool, optional
        """
        if _plotting:
            # Update ModelPlot
            model = moments.ModelPlot._get_model()
            if model is not None:
                model.extinction(over)

        original_folded = self.folded
        # If we started with an folded Spectrum, we need to unfold before
        # marginalizing.
        if original_folded:
            output = self.unfold()
        else:
            output = self.copy()

        orig_mask = output.mask.copy()
        orig_mask.flat[0] = orig_mask.flat[-1] = False
        if np.any(orig_mask):
            logger.warn(
                "Marginalizing a Spectrum with internal masked values. "
                "This may not be a well-defined operation."
            )

        # Do the marginalization
        for axis in sorted(over)[::-1]:
            output = output.sum(axis=axis)
        pop_ids = None
        if self.pop_ids is not None:
            pop_ids = list(self.pop_ids)
            for axis in sorted(over)[::-1]:
                del pop_ids[axis]
        output.folded = False
        output.pop_ids = pop_ids

        if mask_corners is None:
            if self.mask.flat[0] == True or self.mask.flat[-1] == True:
                mask_corners = True
            else:
                mask_corners = False
        if mask_corners:
            output.mask_corners()

        # Return folded or unfolded as original.
        if original_folded:
            return output.fold()
        else:
            return output

[docs]    def swap_axes(self, ax1, ax2):
        """
        Uses np's swapaxes function, but also swaps pop_ids as appropriate
        if pop_ids are given.

        .. note::
            ``fs.swapaxes(ax1, ax2)`` will still work, but if population
            ids are given, it won't swap the pop_ids entries as expected.

        :param ax1: The index of the first population to swap.
        :type ax1: int
        :param ax2: The index of the second population to swap.
        :type ax2: int
        """
        output = np.swapaxes(self, ax1, ax2)
        if output.pop_ids is not None:
            pop1, pop2 = output.pop_ids[ax1], output.pop_ids[ax2]
            output.pop_ids[ax1], output.pop_ids[ax2] = pop2, pop1
        return output

    def _counts_per_entry(self):
        """
        Counts per population for each entry in the fs.
        """
        ind = np.indices(self.shape)
        # Transpose the first access to the last, so ind[ii,jj,kk] = [ii,jj,kk]
        ind = ind.transpose(list(range(1, self.Npop + 1)) + [0])
        return ind

    def _total_per_entry(self):
        """
        Total derived alleles for each entry in the fs.
        """
        return np.sum(self._counts_per_entry(), axis=-1)

[docs]    def log(self):
        """
        Returns the natural logarithm of the entries of the frequency spectrum.

        Only necessary because np.ma.log now fails to propagate extra
        attributes after np 1.10.
        """
        logfs = np.ma.log(self)
        logfs.folded = self.folded
        logfs.pop_ids = self.pop_ids
        return logfs

[docs]    def fold(self):
        """
        Returns a folded frequency spectrum.

        The folded fs assumes that information on which allele is ancestral or
        derived is unavailable. Thus the fs is in terms of minor allele
        frequency.  This makes the fs into a "triangular" array. If a masked cell
        is folded into non-masked cell, the destination cell is masked as well.

        Folding is not done in-place. The return value is a new Spectrum object.
        """
        if self.folded:
            raise ValueError("Input Spectrum is already folded.")

        # How many samples total do we have? The folded fs can only contain
        # entries up to total_samples/2 (rounded down).
        total_samples = np.sum(self.sample_sizes)

        total_per_entry = self._total_per_entry()

        # Here's where we calculate which entries are nonsense in the folded fs.
        where_folded_out = total_per_entry > int(total_samples / 2)

        original_mask = self.mask
        # Here we create a mask that masks any values that were masked in
        # the original fs (or folded onto by a masked value).
        final_mask = np.logical_or(original_mask, Numerics.reverse_array(original_mask))

        # To do the actual folding, we take those entries that would be folded
        # out, reverse the array along all axes, and add them back to the
        # original fs.
        reversed = Numerics.reverse_array(np.where(where_folded_out, self, 0))
        folded = np.ma.masked_array(self.data + reversed)
        folded.data[where_folded_out] = 0

        # Deal with those entries where assignment of the minor allele is
        # ambiguous.
        where_ambiguous = total_per_entry == total_samples / 2.0
        ambiguous = np.where(where_ambiguous, self, 0)
        folded += -0.5 * ambiguous + 0.5 * Numerics.reverse_array(ambiguous)

        # Mask out the remains of the folding operation.
        final_mask = np.logical_or(final_mask, where_folded_out)

        outfs = Spectrum(
            folded, mask=final_mask, data_folded=True, pop_ids=self.pop_ids
        )
        return outfs

[docs]    def unfold(self):
        """
        Returns an unfolded frequency spectrum.

        It is assumed that each state of a SNP is equally likely to be
        ancestral.

        Unfolding is not done in-place. The return value is a new Spectrum object.
        """
        if not self.folded:
            raise ValueError("Input Spectrum is not folded.")

        # Unfolding the data is easy.
        reversed_data = Numerics.reverse_array(self.data)
        newdata = (self.data + reversed_data) / 2.0

        # Unfolding the mask is trickier. We want to preserve masking of entries
        # that were masked in the original Spectrum.
        # Which entries in the original Spectrum were masked solely because
        # they are incompatible with a folded Spectrum?
        total_samples = np.sum(self.sample_sizes)
        total_per_entry = self._total_per_entry()
        where_folded_out = total_per_entry > int(total_samples / 2)

        newmask = np.logical_xor(self.mask, where_folded_out)
        newmask = np.logical_or(newmask, Numerics.reverse_array(newmask))

        outfs = Spectrum(newdata, mask=newmask, data_folded=False, pop_ids=self.pop_ids)
        return outfs

    # Functions that apply demographic events, including integration.
[docs]    def split(self, idx, n0, n1, new_ids=None):
        """
        Splits a population in the SFS into two populations, with the extra
        population placed at the end. Returns a new frequency spectrum.

        :param idx: The index of the population to split.
        :type idx: int
        :param n0: The sample size of the first split population.
        :type n0: int
        :param n1: The sample size of the second split population.
        :type n1: int
        :param new_ids: The population IDs of the split populations. Can only be
            used if pop_ids are given for the input spectrum.
        :type new_ids: list of strings, optional
        """
        if self.folded:
            raise ValueError("Cannot perform split on folded spectrum.")
        if self.pop_ids is None and new_ids is not None:
            raise ValueError("Trying to assign ids to a SFS with no pop_ids.")
        if new_ids is not None and len(new_ids) != 2:
            raise ValueError("new_ids must be a list of two population id strings")
        fs_split = moments.Manips.split_by_index(self, idx, n0, n1)
        if new_ids is not None:
            fs_split.pop_ids = self.pop_ids
            fs_split.pop_ids[idx] = new_ids[0]
            fs_split.pop_ids.append(new_ids[1])
        return fs_split

[docs]    def branch(self, idx, n, new_id=None):
        """
        A "branch" event, where a population gives rise to a child population, while
        persisting. This is conceptually similar to the split event. The number of
        lineages in the new population is provided, and the number of lineages in the
        source/parental population is the original sample size minus the number
        requested for the branched population. Returns a new frequency spectrum.

        :param idx: The index of the population to branch.
        :type idx: int
        :param n: The sample size of the new population.
        :type n: int
        :param new_id: The population ID of the branch populations. The parental
            population retains its original population ID. Can only be
            used if pop_ids are given for the input spectrum.
        :type new_ids: str, optional
        """
        if self.folded:
            raise ValueError("Cannot perform branch on folded spectrum.")
        if self.pop_ids is None and new_id is not None:
            raise ValueError("Trying to assign ids to a SFS with no pop_ids.")
        if idx < 0 or idx >= self.Npop:
            raise ValueError(
                f"Cannot branch population index {idx} in SFS of size {self.Npop}"
            )
        n_parent = self.sample_sizes[idx]
        n0 = n_parent - n
        fs_branch = moments.Manips.split_by_index(self, idx, n0, n)
        if new_id is not None:
            fs_branch.pop_ids = self.pop_ids
            fs_branch.pop_ids.append(new_id)
        return fs_branch

[docs]    def admix(self, idx0, idx1, num_lineages, proportion, new_id=None):
        """
        Returns a new frequency spectrum with an admixed population that arose through
        admixture from indexed populations with given number of lineages and
        proportions from parental populations. This serves as a wrapper for
        ``Manips.admix_into_new``, with the added feature of handling pop_ids.

        If the number of lineages that move are equal to the number
        of lineages previously present in a source population, that source
        population is marginalized.

        :param idx0: Index of first source population.
        :type idx0: int
        :param idx1: Index of second source population.
        :type idx1: int
        :param num_lineages: Number of lineages in the new population. Cannot be
            greater than the number of existing lineages in either source
            populations.
        :type num_lineages: int
        :param proportion: The proportion of lineages that come from the first
            source population (1-proportion acestry comes from the second source
            population). Must be a number between 0 and 1.
        :type proportion: float
        :param new_id: The ID of the new population. Can only be used if the
            population IDs are specified in the input SFS.
        :type new_id: str, optional
        """
        if new_id is not None and self.pop_ids is None:
            raise ValueError("Cannot specify new pop ids if input SFS has no pop_ids")
        if proportion < 0 or proportion > 1:
            raise ValueError("proportion must be between 0 and 1")
        if idx0 == idx1:
            raise ValueError("Cannot admix population with itself")
        if idx0 < 0 or idx0 >= self.Npop or idx1 < 0 or idx1 >= self.Npop:
            raise ValueError(f"Population indexes must be between 0 and {self.Npop-1}")
        fs_admix = moments.Manips.admix_into_new(
            self, idx0, idx1, num_lineages, proportion
        )
        if new_id is not None:
            new_pop_ids = copy.copy(self.pop_ids)
            # remove pop ids for marginalized pops
            for idx in sorted([idx0, idx1])[::-1]:
                if self.sample_sizes[idx] == num_lineages:
                    del new_pop_ids[idx]
            new_pop_ids.append(new_id)
            fs_admix.pop_ids = new_pop_ids
        return fs_admix

[docs]    def pulse_migrate(self, idx_from, idx_to, keep_from, proportion):
        """
        Mass migration (pulse admixture) between two existing populations. The
        target (destination) population has the same number of lineages in the
        output SFS, and the source population has ``keep_from`` number of lineages
        after the pulse event. The proportion is the expected ancestry proportion
        in the target population that comes from the source population.

        This serves as a wrapper for ``Manips.admix_inplace``.

        Depending on the proportion and number of lineages, because this is an
        approximate operation, we often need a large number of lineages from
        the source population to maintain accuracy.

        :param idx_from: Index of source population.
        :type idx_from: int
        :param idx_to: Index of targeet population.
        :type idx_to: int
        :param keep_from: Number of lineages to keep in source population.
        :type keep_from: int
        :param proportion: Ancestry proportion of source population that migrates
            to target population.
        :type proportion: float
        """
        if idx_from < 0 or idx_from >= self.Npop or idx_to < 0 or idx_to >= self.Npop:
            raise ValueError(f"Invalid population index for {self.Npop}D SFS.")
        if proportion < 0 or proportion > 1:
            raise ValueError("proportion must be between 0 and 1")
        if idx_from == idx_to:
            raise ValueError("Cannot admix population into itself")
        fs_pulse = moments.Manips.admix_inplace(
            self, idx_from, idx_to, keep_from, proportion
        )
        if self.pop_ids is not None:
            fs_pulse.pop_ids = self.pop_ids
        return fs_pulse

    # Integrate the SFS in-place
[docs]    def integrate(
        self,
        Npop,
        tf,
        dt_fac=0.02,
        gamma=None,
        h=None,
        m=None,
        theta=1.0,
        adapt_dt=False,
        finite_genome=False,
        theta_fd=None,
        theta_bd=None,
        frozen=[False],
    ):
        """
        Method to simulate the spectrum's evolution for a given set of demographic
        parameters. The SFS is integrated forward-in-time, and the integration
        occurs in-place, meaning you need only call ``fs.integrate( )``, and the
        ``fs`` is updated.

        :param Npop: List of populations' relative effective sizes. Can be given
            as a list of positive values for constant sizes, or as a function that
            returns a list of sizes at a given time.
        :type Npop: list or function that returns a list
        :param tf: The total integration time in genetic units.
        :type tf: float
        :param dt_fac: The timestep factor, default is 0.02. This parameter typically
            does not need to be adjusted.
        :type dt_fac: float, optional
        :param gamma: The selection coefficient (:math:`2 N_e s`), or list of selection
            coefficients if more than one population.
        :type gamma: float or list of floats, optional
        :param h: The dominance coefficient, or list of dominance coefficients in
            each population, if more than one population.
        :type h: float or list of floats, optional
        :param m: The migration rates matrix as a 2-D array with shape nxn,
            where n is the number of populations. The entry of the migration
            matrix m[i,j] is the migration rate from pop j to pop i in genetic
            units, that is, normalized by :math:`2N_e`. `m` may be either a
            2-D array, or a function that returns a 2-D array (with dimensions
            equal to (num pops)x(num pops)).
        :type m: array-like, optional
        :param theta: The scaled mutation rate :math:`4 N_e u`, which defaults to 1.
            ``theta`` can be used in the reversible model in the case of symmetric
            mutation rates. In this case, ``theta`` must be set to << 1.
        :type theta: float, optional
        :param adapt_dt: flag to allow dt correction avoiding negative entries.
        :type adapt_dt: bool, optional
        :param finite_genome: If True, simulate under the finite-genome model with
            reversible mutations. If using this model, we can specify the forward
            and backward mutation rates, which are per-base rates that are not
            scaled by number of mutable loci. If ``theta_fd`` and ``theta_bd``
            are not specified, we assume equal forward and backward mutation rates
            provided by ``theta``, which must be set to less that 1.
            Defaults to False.
        :type finite_genome: bool, optional
        :param theta_fd: The forward mutation rate :math:`4 Ne u`.
        :type theta_fd: float, optional
        :param theta_bd: The backward mutation rate :math:`4 Ne v`.
        :type theta_bd: float, optional
        :param frozen: Specifies the populations that are "frozen", meaning
            samples from that population no longer change due or contribute
            to migration to other populations. This feature is most often
            used to indicate ancient samples, for example, ancient DNA.
            The ``frozen`` parameter is given as a list of same length
            as number of pops, with ``True`` for frozen
            populations at the corresponding index, and ``False`` for
            populations that continue to evolve.
        :type frozen: list of bools
        """

        if tf < 0:
            raise ValueError("Integration time cannot be negative")

        n = np.array(self.shape) - 1

        if m is not None and not callable(m):
            m = np.array(m)

        if finite_genome == True and (theta_fd == None or theta_bd == None):
            if theta >= 1:
                raise ValueError(
                    "In the finite genome model, theta must be much less than 1. "
                    "If symmetric mutation rates, can use theta << 1. Otherwise, "
                    "theta_fd and theta_bd must be specified."
                )
            else:
                theta_fd = theta_bd = theta

        if hasattr(Npop, "__len__"):
            if np.any(frozen) and len(Npop) != len(frozen):
                raise ValueError(
                    "If one or more populations are frozen, length "
                    "of frozen must match number of simulated pops."
                )
        else:
            if np.any(frozen) and len(Npop(0)) != len(frozen):
                raise ValueError(
                    "If one or more populations are frozen, length "
                    "of frozen must match number of simulated pops."
                )

        if _plotting:
            model = moments.ModelPlot._get_model()
            if model is not None:
                model.evolve(tf, Npop, m)

        if len(n) == 1:
            if gamma is None:
                gamma = 0.0
            if h is None:
                h = 0.5
            if gamma == 0:
                self.data[:] = moments.Integration_nomig.integrate_neutral(
                    self.data,
                    Npop,
                    tf,
                    dt_fac,
                    theta,
                    finite_genome=finite_genome,
                    theta_fd=theta_fd,
                    theta_bd=theta_bd,
                    frozen=frozen,
                )
            else:
                self.data[:] = moments.Integration_nomig.integrate_nomig(
                    self.data,
                    Npop,
                    tf,
                    dt_fac,
                    gamma,
                    h,
                    theta,
                    finite_genome=finite_genome,
                    theta_fd=theta_fd,
                    theta_bd=theta_bd,
                    frozen=frozen,
                )
        else:
            if gamma is None:
                gamma = np.zeros(len(n))
            elif not hasattr(gamma, "__len__"):
                gamma = gamma * np.ones(len(n))
            if h is None:
                h = 0.5 * np.ones(len(n))
            elif not hasattr(h, "__len__"):
                h = h * np.ones(len(n))
            if m is None:
                m = np.zeros([len(n), len(n)])
            if not callable(m) and (m == 0).all():
                # for more than 2 populations, the sparse solver
                # seems to be faster than the tridiag...
                if (np.array(gamma) == 0).all() and len(n) < 3:
                    self.data[:] = moments.Integration_nomig.integrate_neutral(
                        self.data,
                        Npop,
                        tf,
                        dt_fac,
                        theta,
                        finite_genome=finite_genome,
                        theta_fd=theta_fd,
                        theta_bd=theta_bd,
                        frozen=frozen,
                    )
                else:
                    self.data[:] = moments.Integration_nomig.integrate_nomig(
                        self.data,
                        Npop,
                        tf,
                        dt_fac,
                        gamma,
                        h,
                        theta,
                        finite_genome=finite_genome,
                        theta_fd=theta_fd,
                        theta_bd=theta_bd,
                        frozen=frozen,
                    )
            else:
                self.data[:] = moments.Integration.integrate_nD(
                    self.data,
                    Npop,
                    tf,
                    dt_fac,
                    gamma,
                    h,
                    m,
                    theta,
                    adapt_dt,
                    finite_genome=finite_genome,
                    theta_fd=theta_fd,
                    theta_bd=theta_bd,
                    frozen=frozen,
                )

    # Functions for computing statistics from frequency spetra.

[docs]    def Fst(self, pairwise=False):
        """
        Wright's Fst between the populations represented in the fs.

        This estimate of Fst assumes random mating, because we don't have
        heterozygote frequencies in the fs.

        Calculation is by the method of Weir and Cockerham *Evolution* 38:1358
        (1984). For a single SNP, the relevant formula is at the top of page
        1363. To combine results between SNPs, we use the weighted average
        indicated by equation 10.

        :param pairwise: Defaults to False. If True, returns a dictionary
            of all pairwise Fst within the multi-dimensional spectrum.
        :type pairwise: bool
        """
        if pairwise:
            Fsts = {}
            for pair in itertools.combinations(range(self.ndim), 2):
                to_marginalize = list(range(self.ndim))
                to_marginalize.pop(to_marginalize.index(pair[0]))
                to_marginalize.pop(to_marginalize.index(pair[1]))
                fs_marg = self.marginalize(to_marginalize)
                if self.pop_ids is not None:
                    pair_key = (self.pop_ids[pair[0]], self.pop_ids[pair[1]])
                else:
                    pair_key = pair
                Fsts[pair_key] = fs_marg.Fst()
            return Fsts
        else:
            # This gets a little obscure because we want to be able to work with
            # spectra of arbitrary dimension.

            # First quantities from page 1360
            r = self.Npop
            ns = self.sample_sizes
            nbar = np.mean(ns)
            nsum = np.sum(ns)
            nc = (nsum - np.sum(ns ** 2) / nsum) / (r - 1)

            # counts_per_pop is an r+1 dimensional array, where the last axis simply
            # records the indices of the entry.
            # For example, counts_per_pop[4,19,8] = [4,19,8]
            counts_per_pop = np.indices(self.shape)
            counts_per_pop = np.transpose(
                counts_per_pop, axes=list(range(1, r + 1)) + [0]
            )

            # The last axis of ptwiddle is now the relative frequency of SNPs in
            # that bin in each of the populations.
            ptwiddle = 1.0 * counts_per_pop / ns

            # Note that pbar is of the same shape as fs...
            pbar = np.sum(ns * ptwiddle, axis=-1) / nsum

            # We need to use 'this_slice' to get the proper aligment between
            # ptwiddle and pbar.
            this_slice = [slice(None)] * r + [np.newaxis]
            s2 = np.sum(ns * (ptwiddle - pbar[tuple(this_slice)]) ** 2, axis=-1) / (
                (r - 1) * nbar
            )

            # Note that this 'a' differs from equation 2, because we've used
            # equation 3 and b = 0 to solve for hbar.
            a = (
                nbar
                / nc
                * (s2 - 1 / (2 * nbar - 1) * (pbar * (1 - pbar) - (r - 1) / r * s2))
            )
            d = 2 * nbar / (2 * nbar - 1) * (pbar * (1 - pbar) - (r - 1) / r * s2)

            # The weighted sum over loci.
            asum = (self * a).sum()
            dsum = (self * d).sum()

            return asum / (asum + dsum)

[docs]    def S(self):
        """
        Returns the number of segregating sites in the frequency spectrum.
        """
        oldmask = self.mask.copy()
        self.mask_corners()
        S = self.sum()
        self.mask = oldmask
        return S

[docs]    def Watterson_theta(self):
        """
        Returns Watterson's estimator of theta.

        .. note::
            This function is only sensible for 1-dimensional spectra.
        """
        if self.Npop != 1:
            raise ValueError("Only defined on a one-dimensional fs.")

        n = self.sample_sizes[0]
        S = self.S()
        an = np.sum(1.0 / np.arange(1, n))

        return S / an

[docs]    def theta_L(self):
        """
        Returns theta_L as defined by Zeng et al. "Statistical Tests for Detecting
        Positive Selection by Utilizing High-Frequency Variants" (2006)
        Genetics

        .. note::
            This function is only sensible for 1-dimensional spectra.
        """
        if self.Npop != 1:
            raise ValueError("Only defined on a one-dimensional fs.")

        n = self.sample_sizes[0]
        return np.sum(np.arange(1, n) * self[1:n]) / (n - 1)

[docs]    def Zengs_E(self):
        """
        Returns Zeng et al.'s E statistic.

        From Zeng et al., "Statistical Tests for Detecting Positive Selection by
        Utilizing High-Frequency Variants." *Genetics*, 2016.
        """
        num = self.theta_L() - self.Watterson_theta()

        n = self.sample_sizes[0]

        # See after Eq. 3
        an = np.sum(1.0 / np.arange(1, n))
        # See after Eq. 9
        bn = np.sum(1.0 / np.arange(1, n) ** 2)
        s = self.S()

        # See immediately after Eq. 12
        theta = self.Watterson_theta()
        theta_sq = s * (s - 1.0) / (an ** 2 + bn)

        # Eq. 14
        var = (n / (2.0 * (n - 1.0)) - 1.0 / an) * theta + (
            bn / an ** 2
            + 2.0 * (n / (n - 1.0)) ** 2 * bn
            - 2 * (n * bn - n + 1.0) / ((n - 1.0) * an)
            - (3.0 * n + 1.0) / (n - 1.0)
        ) * theta_sq

        return num / np.sqrt(var)

[docs]    def pi(self):
        """
        Returns the estimated expected number of pairwise differences between two
        chromosomes in the population.

        .. note::
            This estimate includes a factor of sample_size / (sample_size - 1)
            to make :math:`\\mathbb{E}[\\pi] = \\theta`.
        """
        if self.ndim != 1:
            raise ValueError("Only defined for a one-dimensional SFS.")

        n = self.sample_sizes[0]
        # sample frequencies p
        p = np.arange(0, n + 1, dtype=float) / n
        # This expression derives from Gillespie's _Population_Genetics:_A
        # _Concise_Guide_, 2nd edition, section 2.6.
        return n / (n - 1.0) * 2 * np.ma.sum(self * p * (1 - p))

[docs]    def Tajima_D(self):
        """
        Returns Tajima's D.

        Following Gillespie "Population Genetics: A Concise Guide" pg. 45

        """
        if not self.Npop == 1:
            raise ValueError("Only defined on a one-dimensional SFS.")

        S = self.S()

        n = 1.0 * self.sample_sizes[0]
        pihat = self.pi()
        theta = self.Watterson_theta()

        a1 = np.sum(1.0 / np.arange(1, n))
        a2 = np.sum(1.0 / np.arange(1, n) ** 2)
        b1 = (n + 1) / (3 * (n - 1))
        b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
        c1 = b1 - 1.0 / a1
        c2 = b2 - (n + 2) / (a1 * n) + a2 / a1 ** 2

        C = np.sqrt((c1 / a1) * S + c2 / (a1 ** 2 + a2) * S * (S - 1))

        return (pihat - theta) / C

    # Functions for resampling or generating "data" from an SFS

[docs]    def fixed_size_sample(self, nsamples, include_masked=False):
        """
        Generate a resampled SFS from the current one. Thus, the resampled SFS
        follows a multinomial distribution given by the proportion of sites
        in each bin in the original SFS.

        :param nsamples: Number of samples to include in the new SFS.
        :type nsamples: int
        :param include_masked: If True, use all bins from the SFS. Otherwise,
            use only non-masked bins. Defaults to False.
        :type include_masked: bool, optional
        """
        flat = self.flatten()
        pvals = flat.data
        if include_masked is False:
            pvals[flat.mask] = 0
        pvals /= pvals.sum()

        sample = np.random.multinomial(int(nsamples), pvals)
        sample = sample.reshape(self.shape)
        sample = moments.Spectrum(sample, mask=self.mask, pop_ids=self.pop_ids)
        sample = sample.astype(int)

        return sample

[docs]    def sample(self):
        """
        Generate a Poisson-sampled fs from the current one.

        Entries where the current fs is masked or 0 will be masked in the
        output sampled fs.
        """
        # These are entries where the sampling has no meaning. Either the fs is
        # 0 there or masked.
        bad_entries = np.logical_or(self == 0, self.mask)
        # We convert to a 1-d array for passing into the sampler
        means = self.ravel().copy()
        # Filter out those bad entries.
        means[bad_entries.ravel()] = 1
        # Sample
        samp = scipy.stats.distributions.poisson.rvs(means, size=len(means))
        # Replace bad entries...
        samp[bad_entries.ravel()] = 0
        # Convert back to a properly shaped array
        samp = samp.reshape(self.shape)
        # Convert to a fs and mask the bad entries
        samp = Spectrum(
            samp, mask=bad_entries, data_folded=self.folded, pop_ids=self.pop_ids
        )
        samp = samp.astype(int)
        return samp

[docs]    def genotype_matrix(
        self, num_sites=None, sample_sizes=None, diploid_genotypes=False
    ):
        """
        Generate a genotype matrix of independent loci. For multi-population spectra,
        the individual columns are filled in the sample order as the populations
        in the SFS.

        .. note::
            Sites in the output genotype matrix are necessarily separated by
            infinite recombination. The SFS assumes all loci are segregating
            independently, so there is no linkage between them.

        Returns a genotype matrix of size number of sites by total sample size.

        :param num_sites: Defaults to None, in which case we take a poisson sample from
            the SFS. Otherwise, we take a fixed number of sites.
        :param sample_sizes: The sample size in each population, as a list with length
            of the number of dimension (populations) in the SFS.
        :diploid_genotypes: Defaults to False, in which case we return a haplotype
            matrix of size (num_sites x sum(sample_sizes)). If True, we return a
            diploid genotype matrix (filled with 0, 1, 2) of size
            (num_sites x sum(sample_sizes)/2).
        """
        # project to sample size as needed
        if sample_sizes is None:
            sample_sizes = self.sample_sizes
        proj = self.project(sample_sizes)
        if diploid_genotypes:
            for ss in proj.sample_sizes:
                if ss % 2 != 0:
                    raise ValueError(
                        "Requested diploid genotypes, but the haploid sample "
                        f"sizes ({sample_sizes}) are not all even."
                    )

        # get sampled SFS
        if num_sites is None:
            samp = proj.sample()
            num_sites = samp.sum()
        else:
            samp = proj.fixed_size_sample(num_sites)
        assert num_sites == samp.S()  # remove once tested

        # fill in genotype matrix
        G = np.zeros((num_sites, sum(proj.sample_sizes)), dtype=int)
        i = 0
        for idx in np.transpose(np.nonzero(samp)):
            for _ in range(samp[tuple(idx)]):
                g = []
                for j, n in zip(idx, sample_sizes):
                    g += list(np.random.permutation([1] * j + [0] * (n - j)))
                G[i] = g
                i += 1

        # collapse diploids if needed
        hap_sum = G.sum(axis=1)
        if diploid_genotypes:
            G = G[:, ::2] + G[:, 1::2]
        assert np.all(G.sum(axis=1) == hap_sum)  # remove once tested

        # shuffle genotype matrix
        np.random.shuffle(G)

        return G

    # Functions for saving and loading frequency spectra.

[docs]    @staticmethod
    def from_file(fid, mask_corners=True, return_comments=False):
        """
        Read frequency spectrum from file.

        See ``to_file`` for details on the file format.

        :param fid: string with file name to read from or an open file object.
        :type fid: string
        :param mask_corners: If True, mask the 'absent in all samples' and 'fixed in
            all samples' entries.
        :type mask_corners: bool, optional
        :param return_comments: If true, the return value is (fs, comments), where
            comments is a list of strings containing the comments
            from the file (without #'s).
        :type return_comments: bool, optional
        """
        newfile = False
        # Try to read from fid. If we can't, assume it's something that we can
        # use to open a file.
        if not hasattr(fid, "read"):
            newfile = True
            fid = open(fid, "r")

        line = fid.readline()
        # Strip out the comments
        comments = []
        while line.startswith("#"):
            comments.append(line[1:].strip())
            line = fid.readline()

        # Read the shape of the data
        shape_spl = line.split()
        if "folded" not in shape_spl and "unfolded" not in shape_spl:
            # This case handles the old file format
            shape = tuple([int(d) for d in shape_spl])
            folded = False
            pop_ids = None
        else:
            # This case handles the new file format
            shape, next_ii = [int(shape_spl[0])], 1
            while shape_spl[next_ii] not in ["folded", "unfolded"]:
                shape.append(int(shape_spl[next_ii]))
                next_ii += 1
            folded = shape_spl[next_ii] == "folded"
            # Are there population labels in the file?
            if len(shape_spl) > next_ii + 1:
                pop_ids = line.split('"')[1::2]
            else:
                pop_ids = None

        data = np.fromstring(fid.readline().strip(), count=np.product(shape), sep=" ")
        # fromfile returns a 1-d array. Reshape it to the proper form.
        data = data.reshape(*shape)

        maskline = fid.readline().strip()
        if not maskline:
            # The old file format didn't have a line for the mask
            mask = np.ma.nomask
        else:
            # This case handles the new file format
            mask = np.fromstring(maskline, count=np.product(shape), sep=" ")
            mask = mask.reshape(*shape)

        # If we opened a new file, clean it up.
        if newfile:
            fid.close()

        fs = Spectrum(data, mask, mask_corners, data_folded=folded, pop_ids=pop_ids)

        if not return_comments:
            return fs
        else:
            return fs, comments

    fromfile = from_file

[docs]    def to_file(self, fid, precision=16, comment_lines=[], foldmaskinfo=True):
        """
        Write frequency spectrum to file.

        The file format is:

        - Any number of comment lines beginning with a '#'
        - A single line containing N integers giving the dimensions of the fs
          array. So this line would be '5 5 3' for an SFS that was 5x5x3.
          (That would be 4x4x2 *samples*.)
        - On the *same line*, the string 'folded' or 'unfolded' denoting the
          folding status of the array
        - On the *same line*, optional strings each containing the population
          labels in quotes separated by spaces, e.g. "pop 1" "pop 2"
        - A single line giving the array elements. The order of elements is
          e.g.: fs[0,0,0] fs[0,0,1] fs[0,0,2] ... fs[0,1,0] fs[0,1,1] ...
        - A single line giving the elements of the mask in the same order as
          the data line. '1' indicates masked, '0' indicates unmasked.

        :param fid: string with file name to write to or an open file object.
        :type fid: string
        :param precision: precision with which to write out entries of the SFS. (They
            are formated via %.<p>g, where <p> is the precision.) Defaults to 16.
        :type precision: int, optional
        :param comment_lines: list of strings to be used as comment lines in the header
            of the output file.
        :type comment_lines: list of strings, optional
        :param foldmaskinfo: If False, folding and mask and population label
            information will not be saved.
        :type foldmaskinfo: bool, optional
        """
        # Open the file object.
        newfile = False
        if not hasattr(fid, "write"):
            newfile = True
            fid = open(fid, "w")

        # Write comments
        for line in comment_lines:
            fid.write("# ")
            fid.write(line.strip())
            fid.write(os.linesep)

        # Write out the shape of the fs
        for elem in self.data.shape:
            fid.write("%i " % elem)

        if foldmaskinfo:
            if not self.folded:
                fid.write("unfolded")
            else:
                fid.write("folded")
            if self.pop_ids is not None:
                for label in self.pop_ids:
                    fid.write(' "%s"' % label)

        fid.write(os.linesep)

        # Write the data to the file
        self.data.tofile(fid, " ", "%%.%ig" % precision)
        fid.write(os.linesep)

        if foldmaskinfo:
            # Write the mask to the file
            np.asarray(self.mask, int).tofile(fid, " ")
            fid.write(os.linesep)

        # Close file
        if newfile:
            fid.close()

    ## Overide the (perhaps confusing) original np tofile method.
    tofile = to_file

[docs]    @staticmethod
    def from_ms_file(
        fid,
        average=True,
        mask_corners=True,
        return_header=False,
        pop_assignments=None,
        pop_ids=None,
        bootstrap_segments=1,
    ):
        """
        Read frequency spectrum from file of ms output.

        :param fid: string with file name to read from or an open file object.
        :param average: If True, the returned fs is the average over the runs in the ms
            file. If False, the returned fs is the sum.
        :param mask_corners: If True, mask the 'absent in all samples' and 'fixed in
            all samples' entries.
        :param return_header: If True, the return value is (fs, (command,seeds), where
            command and seeds are strings containing the ms
            commandline and the seeds used.
        :param pop_assignments: If None, the assignments of samples to populations is
            done automatically, using the assignment in the ms
            command line. To manually assign populations, pass a
            list of the from [6,8]. This example places
            the first 6 samples into population 1, and the next 8
            into population 2.
        :param pop_ids: Optional list of strings containing the population labels.
            If pop_ids is None, labels will be "pop0", "pop1", ...
        :param bootstrap_segments: If bootstrap_segments is an integer greater than 1,
            the data will be broken up into that many segments
            based on SNP position. Instead of single FS, a list
            of spectra will be returned, one for each segment.
        """
        warnings.warn(
            "Spectrum.from_ms_file() is deprecated and will be removed in version 1.2",
            warnings.DeprecationWarning,
        )
        newfile = False
        # Try to read from fid. If we can't, assume it's something that we can
        # use to open a file.
        if not hasattr(fid, "read"):
            newfile = True
            fid = open(fid, "r")

        # Parse the commandline
        command = line = fid.readline()
        command_terms = line.split()

        if command_terms[0].count("ms"):
            runs = int(command_terms[2])
            try:
                pop_flag = command_terms.index("-I")
                num_pops = int(command_terms[pop_flag + 1])
                pop_samples = [
                    int(command_terms[pop_flag + ii]) for ii in range(2, 2 + num_pops)
                ]
            except ValueError:
                num_pops = 1
                pop_samples = [int(command_terms[1])]
        else:
            raise ValueError("Unrecognized command string: %s." % command)

        total_samples = np.sum(pop_samples)
        if pop_assignments:
            num_pops = len(pop_assignments)
            pop_samples = pop_assignments

        sample_indices = np.cumsum([0] + pop_samples)
        bottom_l = sample_indices[:-1]
        top_l = sample_indices[1:]

        seeds = line = fid.readline()
        while not line.startswith("//"):
            line = fid.readline()

        counts = np.zeros(len(pop_samples), np.int_)
        fs_shape = np.asarray(pop_samples) + 1
        dimension = len(counts)

        if dimension > 1:
            bottom0 = bottom_l[0]
            top0 = top_l[0]
            bottom1 = bottom_l[1]
            top1 = top_l[1]
        if dimension > 2:
            bottom2 = bottom_l[2]
            top2 = top_l[2]
        if dimension > 3:
            bottom3 = bottom_l[3]
            top3 = top_l[3]
        if dimension > 4:
            bottom4 = bottom_l[4]
            top4 = top_l[4]
        if dimension > 5:
            bottom5 = bottom_l[5]
            top5 = top_l[5]

        all_data = [
            np.zeros(fs_shape, np.int_) for boot_ii in range(bootstrap_segments)
        ]
        for run_ii in range(runs):
            line = fid.readline()
            segsites = int(line.split()[-1])

            if segsites == 0:
                # Special case, need to read 3 lines to stay synced.
                for _ in range(3):
                    line = fid.readline()
                continue
            line = fid.readline()
            while not line.startswith("positions"):
                line = fid.readline()

            # Read SNP positions for creating bootstrap segments
            positions = [float(_) for _ in line.split()[1:]]
            # Where we should break our interval to create our bootstraps
            breakpts = np.linspace(0, 1, bootstrap_segments + 1)
            # The indices that correspond to those breakpoints
            break_iis = np.searchsorted(positions, breakpts)
            # Correct for searchsorted behavior if last position is 1,
            # to ensure all SNPs are captured
            break_iis[-1] = len(positions)

            # Read the chromosomes in
            chromos = fid.read((segsites + 1) * total_samples)

            # For each bootstrap segment, relevant SNPs run from start_ii:end_ii
            for boot_ii, (start_ii, end_ii) in enumerate(
                zip(break_iis[:-1], break_iis[1:])
            ):
                # Use the data array corresponding to this bootstrap segment
                data = all_data[boot_ii]
                for snp in range(start_ii, end_ii):
                    # Slice to get all the entries that refer to a given SNP
                    this_snp = chromos[snp :: segsites + 1]
                    # Count SNPs per population, and record them.
                    if dimension == 1:
                        data[this_snp.count("1")] += 1
                    elif dimension == 2:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                        ] += 1
                    elif dimension == 3:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                        ] += 1
                    elif dimension == 4:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                            this_snp[bottom3:top3].count("1"),
                        ] += 1
                    elif dimension == 5:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                            this_snp[bottom3:top3].count("1"),
                            this_snp[bottom4:top4].count("1"),
                        ] += 1
                    elif dimension == 6:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                            this_snp[bottom3:top3].count("1"),
                            this_snp[bottom4:top4].count("1"),
                            this_snp[bottom5:top5].count("1"),
                        ] += 1
                    else:
                        # This is noticably slower, so we special case the cases
                        # above.
                        for dim_ii in range(dimension):
                            bottom = bottom_l[dim_ii]
                            top = top_l[dim_ii]
                            counts[dim_ii] = this_snp[bottom:top].count("1")
                        data[tuple(counts)] += 1

            # Read to the next iteration
            line = fid.readline()
            line = fid.readline()

        if newfile:
            fid.close()

        all_fs = [
            Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
            for data in all_data
        ]
        if average:
            all_fs = [fs / runs for fs in all_fs]

        # If we aren't setting up for bootstrapping, return fs, rather than a
        # list of length 1. (This ensures backward compatibility.)
        if bootstrap_segments == 1:
            all_fs = all_fs[0]

        if not return_header:
            return all_fs
        else:
            return all_fs, (command, seeds)

    @staticmethod
    def _from_sfscode_file(
        fid,
        sites="all",
        average=True,
        mask_corners=True,
        return_header=False,
        pop_ids=None,
    ):
        """
        Read frequency spectrum from file of sfs_code output.

        :param fid: string with file name to read from or an open file object.
        :param sites: If sites=='all', return the fs of all sites. If sites == 'syn',
            use only synonymous mutations. If sites == 'nonsyn', use
            only non-synonymous mutations.
        :param average: If True, the returned fs is the average over the runs in the
            file. If False, the returned fs is the sum.
        :param mask_corners: If True, mask the 'absent in all samples' and 'fixed in
            all samples' entries.
        :param return_header: If true, the return value is (fs, (command,seeds), where
            command and seeds are strings containing the ms
            commandline and the seeds used.
        :param pop_ids: Optional list of strings containing the population labels.
            If pop_ids is None, labels will be "pop0", "pop1", ...
        """
        warnings.warn(
            "Spectrum.from_sfscode_file() is deprecated and will be removed in ver 1.2",
            warnings.DeprecationWarning,
        )
        newfile = False
        # Try to read from fid. If we can't, assume it's something that we can
        # use to open a file.
        if not hasattr(fid, "read"):
            newfile = True
            fid = open(fid, "r")

        if sites == "all":
            only_nonsyn, only_syn = False, False
        elif sites == "syn":
            only_nonsyn, only_syn = False, True
        elif sites == "nonsyn":
            only_nonsyn, only_syn = True, False
        else:
            raise ValueError(
                "'sites' argument must be one of ('all', 'syn', " "'nonsyn')."
            )

        command = fid.readline()
        command_terms = command.split()

        runs = int(command_terms[2])
        num_pops = int(command_terms[1])

        # sfs_code default is 6 individuals, and I assume diploid pop
        pop_samples = [12] * num_pops
        if "--sampSize" in command_terms or "-n" in command_terms:
            try:
                pop_flag = command_terms.index("--sampSize")
                pop_flag = command_terms.index("-n")
            except ValueError:
                pass
            pop_samples = [
                2 * int(command_terms[pop_flag + ii]) for ii in range(1, 1 + num_pops)
            ]

        pop_samples = np.asarray(pop_samples)
        pop_digits = [str(i) for i in range(num_pops)]
        pop_fixed_str = [",%s.-1" % i for i in range(num_pops)]
        pop_count_str = [",%s." % i for i in range(num_pops)]

        seeds = fid.readline()
        line = fid.readline()

        data = np.zeros(np.asarray(pop_samples) + 1, np.int_)

        # line = //iteration...
        line = fid.readline()
        for iter_ii in range(runs):
            for ii in range(5):
                line = fid.readline()

            # It is possible for a mutation to be listed several times in the
            # output.  To accomodate this, I keep a dictionary of identities
            # for those mutations, and hold off processing them until I've seen
            # all mutations listed for the iteration.
            mut_dict = {}

            # Loop until this iteration ends.
            while not line.startswith("//") and line != "":
                split_line = line.split(";")
                if split_line[-1] == "\n":
                    split_line = split_line[:-1]

                # Loop over mutations on this line.
                for mut_ii, mutation in enumerate(split_line):
                    counts_this_mut = np.zeros(num_pops, np.int_)

                    split_mut = mutation.split(",")

                    # Exclude synonymous mutations
                    if only_nonsyn and split_mut[7] == "0":
                        continue
                    # Exclude nonsynonymous mutations
                    if only_syn and split_mut[7] == "1":
                        continue

                    ind_start = len(",".join(split_mut[:12]))
                    by_individual = mutation[ind_start:]

                    mut_id = ",".join(split_mut[:4] + split_mut[5:11])

                    # Count mutations in each population
                    for pop_ii, fixed_str, count_str in zip(
                        range(num_pops), pop_fixed_str, pop_count_str
                    ):
                        if fixed_str in by_individual:
                            counts_this_mut[pop_ii] = pop_samples[pop_ii]
                        else:
                            counts_this_mut[pop_ii] = by_individual.count(count_str)

                    # Initialize the list that will track the counts for this
                    # mutation. Using setdefault means that it won't overwrite
                    # if there's already a list stored there.
                    mut_dict.setdefault(mut_id, [0] * num_pops)
                    for ii in range(num_pops):
                        if counts_this_mut[ii] > 0 and mut_dict[mut_id][ii] > 0:
                            sys.stderr.write(
                                "Contradicting counts between "
                                "listings for mutation %s in "
                                "population %i." % (mut_id, ii)
                            )
                        mut_dict[mut_id][ii] = max(
                            counts_this_mut[ii], mut_dict[mut_id][ii]
                        )

                line = fid.readline()

            # Now apply all the mutations with fixations that we deffered.
            for mut_id, counts in mut_dict.items():
                if np.any(np.asarray(counts) > pop_samples):
                    sys.stderr.write(
                        "counts_this_mut > pop_samples: %s > "
                        "%s\n%s\n" % (counts, pop_samples, mut_id)
                    )
                    counts = np.minimum(counts, pop_samples)
                data[tuple(counts)] += 1

        if newfile:
            fid.close()

        fs = Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
        if average:
            fs /= runs

        if not return_header:
            return fs
        else:
            return fs, (command, seeds)

[docs]    def scramble_pop_ids(self, mask_corners=True):
        """
        Spectrum corresponding to scrambling individuals among populations.

        This is useful for assessing how diverged populations are.
        Essentially, it pools all the individuals represented in the fs and
        generates new populations of random individuals (without replacement)
        from that pool. If this fs is significantly different from the
        original, that implies population structure.
        """
        original_folded = self.folded
        # If we started with an folded Spectrum, we need to unfold before
        # projecting.
        if original_folded:
            self = self.unfold()

        total_samp = np.sum(self.sample_sizes)

        # First generate a 1d sfs for the pooled population.
        combined = np.zeros(total_samp + 1)
        # For each entry in the fs, this is the total number of derived alleles
        total_per_entry = self._total_per_entry()
        # Sum up to generate the equivalent 1-d spectrum.
        for derived, counts in zip(total_per_entry.ravel(), self.ravel()):
            combined[derived] += counts

        # Now resample back into a n-d spectrum
        # For each entry, this is the counts per popuation.
        #  e.g. counts_per_entry[3,4,5] = [3,4,5]
        counts_per_entry = self._counts_per_entry()
        # Reshape it to be 1-d, so we can iterate over it easily.
        counts_per_entry = counts_per_entry.reshape(np.prod(self.shape), self.ndim)
        resamp = np.zeros(self.shape)
        for counts, derived in zip(counts_per_entry, total_per_entry.ravel()):
            # The probability here is
            # (t1 choose d1)*(t2 choose d2)/(ntot choose derived)
            lnprob = sum(
                Numerics._lncomb(t, d) for t, d in zip(self.sample_sizes, counts)
            )
            lnprob -= Numerics._lncomb(total_samp, derived)
            prob = np.exp(lnprob)
            # Assign result using the appropriate weighting
            resamp[tuple(counts)] += prob * combined[derived]

        resamp = Spectrum(resamp, mask_corners=mask_corners)
        if not original_folded:
            return resamp
        else:
            return resamp.fold()

[docs]    @staticmethod
    def from_data_dict(
        data_dict, pop_ids, projections, mask_corners=True, polarized=True
    ):
        """
        Spectrum from a dictionary of polymorphisms.

        The data dictionary should be organized as:

        .. code-block::

            {snp_id: {
                'segregating': ['A','T'],
                'calls': {
                    'YRI': (23,3),
                    'CEU': (7,3)
                },
                'outgroup_allele': 'T'
            }}

        The 'calls' entry gives the successful calls in each population, in the
        order that the alleles are specified in 'segregating'.
        Non-diallelic polymorphisms are skipped.

        :param pop_ids: list of which populations to make fs for.
        :param projections: list of sample sizes to project down to for each
            population.
        :param polarized: If True, the data are assumed to be correctly polarized by
            'outgroup_allele'. SNPs in which the 'outgroup_allele'
            information is missing or '-' or not concordant with the
            segregating alleles will be ignored.
            If False, any 'outgroup_allele' info present is ignored,
            and the returned spectrum is folded.
        """
        Npops = len(pop_ids)
        fs = np.zeros(np.asarray(projections) + 1)
        for snp, snp_info in data_dict.items():
            # Skip SNPs that aren't biallelic.
            if len(snp_info["segregating"]) != 2:
                continue

            allele1, allele2 = snp_info["segregating"]
            if not polarized:
                # If we don't want to polarize, we can choose which allele is
                # derived arbitrarily since we'll fold anyways.
                outgroup_allele = allele1
            elif (
                "outgroup_allele" in snp_info
                and snp_info["outgroup_allele"] != "-"
                and snp_info["outgroup_allele"] in snp_info["segregating"]
            ):
                # Otherwise we need to check that it's a useful outgroup
                outgroup_allele = snp_info["outgroup_allele"]
            else:
                # If we're polarized and we didn't have good outgroup info, skip
                # this SNP.
                continue

            # Extract the allele calls for each population.
            allele1_calls = [snp_info["calls"][pop][0] for pop in pop_ids]
            allele2_calls = [snp_info["calls"][pop][1] for pop in pop_ids]
            # How many chromosomes did we call successfully in each population?
            successful_calls = [
                a1 + a2 for (a1, a2) in zip(allele1_calls, allele2_calls)
            ]

            # Which allele is derived (different from outgroup)?
            if allele1 == outgroup_allele:
                derived_calls = allele2_calls
            elif allele2 == outgroup_allele:
                derived_calls = allele1_calls

            # To handle arbitrary numbers of populations in the fs, we need
            # to do some tricky slicing.
            slices = [[np.newaxis] * len(pop_ids) for ii in range(Npops)]
            for ii in range(len(pop_ids)):
                slices[ii][ii] = slice(None, None, None)

            # Do the projection for this SNP.
            pop_contribs = []
            iter = zip(projections, successful_calls, derived_calls)
            for pop_ii, (p_to, p_from, hits) in enumerate(iter):
                contrib = Numerics._cached_projection(p_to, p_from, hits)[
                    tuple(slices[pop_ii])
                ]
                pop_contribs.append(contrib)
            fs += functools.reduce(operator.mul, pop_contribs)
        fsout = Spectrum(fs, mask_corners=mask_corners, pop_ids=pop_ids)
        assert np.all(fsout >= 0)
        if polarized:
            if np.sum(fsout) == 0:
                warnings.warn(
                    "Spectrum is empty. Did you compute the outgroup alleles "
                    "at variable sites?",
                    warnings.RuntimeWarning,
                )
            return fsout
        else:
            if np.sum(fsout) == 0:
                warnings.warn(
                    "Spectrum is empty. Check input data dictionary.",
                    warnings.RuntimeWarning,
                )
            return fsout.fold()

    @staticmethod
    def _from_count_dict(count_dict, projections, polarized=True, pop_ids=None):
        """
        Frequency spectrum from data mapping SNP configurations to counts.

        :param count_dict: Result of Misc.count_data_dict
        :param projections: List of sample sizes to project down to for each
            population.
        :param polarized: If True, only include SNPs that count_dict marks as polarized.
            If False, include all SNPs and fold resulting Spectrum.
        :param pop_ids: Optional list of strings containing the population labels.
        """

        # create slices for projection calculation
        slices = [[np.newaxis] * len(projections) for ii in range(len(projections))]
        for ii in range(len(projections)):
            slices[ii][ii] = slice(None, None, None)

        fs_total = moments.Spectrum(
            np.zeros(np.array(projections) + 1), pop_ids=pop_ids
        )
        for (
            (called_by_pop, derived_by_pop, this_snp_polarized),
            count,
        ) in count_dict.items():
            if polarized and not this_snp_polarized:
                continue
            pop_contribs = []
            iter = zip(projections, called_by_pop, derived_by_pop)
            for pop_ii, (p_to, p_from, hits) in enumerate(iter):
                contrib = Numerics._cached_projection(p_to, p_from, hits)[
                    slices[pop_ii]
                ]
                pop_contribs.append(contrib)
            fs_proj = functools.reduce(operator.mul, pop_contribs)

            # create slices for adding projected fs to overall fs
            fs_total += count * fs_proj
        if polarized:
            return fs_total
        else:
            return fs_total.fold()

    @staticmethod
    def _data_by_tri(data_dict):
        """
        Nest the data by derived context and outgroup base.

        The resulting dictionary contains only SNPs which are appropriate for
        use of Hernandez's ancestral misidentification correction. It is
        organized as {(derived_tri, outgroup_base): {snp_id: data,...}}
        """
        result = {}
        genetic_bases = "ACTG"
        for snp, snp_info in data_dict.items():
            # Skip non-diallelic polymorphisms
            if len(snp_info["segregating"]) != 2:
                continue
            allele1, allele2 = snp_info["segregating"]
            # Filter out SNPs where we either non-constant ingroup or outgroup
            # context.
            try:
                ingroup_tri = snp_info["context"]
                outgroup_tri = snp_info["outgroup_context"]
            except KeyError:
                continue
            if not outgroup_tri[1] == snp_info["outgroup_allele"]:
                raise ValueError(
                    "Outgroup context and allele are inconsistent "
                    "for polymorphism: %s." % snp
                )
            outgroup_allele = outgroup_tri[1]

            # These are all the requirements to apply the ancestral correction.
            # First 2 are constant context.
            # Next 2 are sensible context.
            # Next 1 is that outgroup allele is one of the segregating.
            # Next 2 are that segregating alleles are sensible.
            if (
                outgroup_tri[0] != ingroup_tri[0]
                or outgroup_tri[2] != ingroup_tri[2]
                or ingroup_tri[0] not in genetic_bases
                or ingroup_tri[2] not in genetic_bases
                or outgroup_allele not in [allele1, allele2]
                or allele1 not in genetic_bases
                or allele2 not in genetic_bases
            ):
                continue

            if allele1 == outgroup_allele:
                derived_allele = allele2
            elif allele2 == outgroup_allele:
                # In this case, the second allele is non_outgroup
                derived_allele = allele1
            derived_tri = ingroup_tri[0] + derived_allele + ingroup_tri[2]
            result.setdefault((derived_tri, outgroup_allele), {})
            result[derived_tri, outgroup_allele][snp] = snp_info
        return result

    @staticmethod
    def _from_data_dict_corrected(
        data_dict, pop_ids, projections, fux_filename, force_pos=True, mask_corners=True
    ):
        """
        Spectrum from a dictionary of polymorphisms, corrected for ancestral
        misidentification.

        The correction is based upon Hernandez, Williamson & Bustamante *Mol Biol Evol*
        24:1792 (2007)

        :param force_pos: If the correction is too agressive, it may leave some small
            entries in the fs less than zero. If force_pos is true,
            these entries will be set to zero, in such a way that the
            total number of segregating SNPs is conserved.
        :param fux_filename: The name of the file containing the
            misidentification probabilities.
            The file is of the form:
            - # Any number of comments lines beginning with #
            - AAA T 0.001
            - AAA G 0.02
            - ...
            Where every combination of three + one bases is considered
            (order is not important).  The triplet is the context and
            putatively derived allele (x) in the reference species. The
            single base is the base (u) in the outgroup. The numerical
            value is 1-f_{ux} in the notation of the paper.

        The data dictionary should be organized as:
            {snp_id:{'segregating': ['A','T'],
                     'calls': {'YRI': (23,3),
                                'CEU': (7,3)
                                },
                     'outgroup_allele': 'T',
                     'context': 'CAT',
                     'outgroup_context': 'CAT'
                    }
            }
        The additional entries are 'context', which includes the two flanking
        bases in the species of interest, and 'outgroup_context', which
        includes the aligned bases in the outgroup.

        This method skips entries for which the correction cannot be applied.
        Most commonly this is because of missing or non-constant context.
        """
        # Read the fux file into a dictionary.
        fux_dict = {}
        f = open(fux_filename)
        for line in f.readlines():
            if line.startswith("#"):
                continue
            sp = line.split()
            fux_dict[(sp[0], sp[1])] = 1 - float(sp[2])
        f.close()

        # Divide the data into classes based on ('context', 'outgroup_allele')
        by_context = Spectrum._data_by_tri(data_dict)

        fs = np.zeros(np.asarray(projections) + 1)
        while by_context:
            # Each time through this loop, we eliminate two entries from the
            # data dictionary. These correspond to one class and its
            # corresponding misidentified class.
            (derived_tri, out_base), nomis_data = by_context.popitem()

            # The corresponding bases if the ancestral state had been
            # misidentifed.
            mis_out_base = derived_tri[1]
            mis_derived_tri = derived_tri[0] + out_base + derived_tri[2]
            # Get the data for that case. Note that we default to an empty
            # dictionary if we don't have data for that class.
            mis_data = by_context.pop((mis_derived_tri, mis_out_base), {})

            fux = fux_dict[(derived_tri, out_base)]
            fxu = fux_dict[(mis_derived_tri, mis_out_base)]

            # Get the spectra for these two cases
            Nux = Spectrum.from_data_dict(nomis_data, pop_ids, projections)
            Nxu = Spectrum.from_data_dict(mis_data, pop_ids, projections)

            # Equations 5 & 6 from the paper.
            Nxu_rev = Numerics.reverse_array(Nxu)
            Rux = (fxu * Nux - (1 - fxu) * Nxu_rev) / (fux + fxu - 1)
            Rxu = Numerics.reverse_array(
                (fux * Nxu_rev - (1 - fux) * Nux) / (fux + fxu - 1)
            )

            fs += Rux + Rxu

        # Here we take the negative entries, and flip them back, so they end up
        # zero and the total number of SNPs is conserved.
        if force_pos:
            negative_entries = np.minimum(0, fs)
            fs -= negative_entries
            fs += Numerics.reverse_array(negative_entries)

        return Spectrum(fs, mask_corners=mask_corners, pop_ids=pop_ids)

[docs]    @staticmethod
    def from_demes(
        g,
        sampled_demes=None,
        sample_sizes=None,
        sample_times=None,
        samples=None,
        Ne=None,
        unsampled_n=4,
        gamma=None,
        h=None,
    ):
        """
        Takes a deme graph and computes the SFS. ``demes`` is a package for
        specifying demographic models in a user-friendly, human-readable YAML
        format. This function automatically parses the demographic description
        and returns a SFS for the specified populations and sample sizes.

        .. note::

            If a deme sample time is requested that is earlier than the deme's
            end time, for example to simulate ancient samples, we must create a
            new population for that ancient sample. This can cause large
            slow-downs, as the computation cost of computing the SFS grows
            quickly in the number of populations.

        :param g: A ``demes`` DemeGraph from which to compute the SFS. The DemeGraph
            can either be specified as a YAML file, in which case `g` is a string,
            or as a ``DemeGraph`` object.
        :type g: str or :class:`demes.DemeGraph`
        :param sampled_demes: A list of deme IDs to take samples from. We can repeat
            demes, as long as the sampling of repeated deme IDs occurs at distinct
            times.
        :type sampled_demes: list of strings
        :param sample_sizes: A list of the same length as ``sampled_demes``,
            giving the sample sizes for each sampled deme.
        :type sample_sizes: list of ints
        :param sample_times: If None, assumes all sampling occurs at the end of the
            existence of the sampled deme. If there are
            ancient samples, ``sample_times`` must be a list of same length as
            ``sampled_demes``, giving the sampling times for each sampled
            deme. Sampling times are given in time units of the original deme graph,
            so might not necessarily be generations (e.g. if ``g.time_units`` is years)
        :type sample_times: list of floats, optional
        :param Ne: reference population size. If none is given, we use the initial
            size of the root deme.
        :type Ne: float, optional
        :param unsampled_n: The default sample size of unsampled demes, which must be
            greater than or equal to 4.
        :type unsampled_n: int, optional
        :param gamma: The scaled selection coefficient(s), 2*Ne*s. Defaults to None,
            which implies neutrality. Can be given as a scalar value, in which case
            all populations have the same selection coefficient. Alternatively, can
            be given as a dictionary, with keys given as population names in the
            input Demes model. Any population missing from this dictionary will be
            assigned a selection coefficient of zero. A non-zero default selection
            coefficient can be provided, using the key `_default`. See the Demes
            exension documentation for more details and examples.
        :type gamma: float or dict
        :param h: The dominance coefficient(s). Defaults to additivity (or genic
            selection). Can be given as a scalar value, in which case all populations
            have the same dominance coefficient. Alternatively, can be given as a
            dictionary, with keys given as population names in the input Demes model.
            Any population missing from this dictionary will be assigned a dominance
            coefficient of 1/2 (additivity). A different default dominance
            coefficient can be provided, using the key `_default`. See the Demes
            exension documentation for more details and examples.
        :type h: float or dict
        :return: A ``moments`` site frequency spectrum, with dimension equal to the
            length of ``sampled_demes``, and shape equal to ``sample_sizes`` plus one
            in each dimension, indexing the allele frequency in each deme from 0
            to n[i], where i is the deme index.
        :rtype: :class:`moments.Spectrum`
        """
        if isinstance(g, str):
            dg = demes.load(g)
        else:
            dg = g

        if samples is None:
            if sampled_demes is None or sample_sizes is None:
                raise ValueError(
                    "must specify either samples (as a dict mapping demes to sample sizes,"
                    " or specify both sampled_demes and sample_times"
                )
        elif samples is not None:
            if type(samples) is not dict:
                raise ValueError("samples must be a dict mapping demes to sample sizes")
            if sampled_demes is not None or sample_sizes is not None:
                raise ValueError(
                    "if samples is given as dict, cannot "
                    "specify sampled_demes or sample_sizes"
                )
            if sample_times is not None:
                raise ValueError(
                    "if samples is given as dict, cannot specify sample times"
                )
            sampled_demes = list(samples.keys())
            sample_sizes = list(samples.values())

        fs = Demes.SFS(
            dg,
            sampled_demes,
            sample_sizes,
            sample_times=sample_times,
            Ne=Ne,
            unsampled_n=unsampled_n,
            gamma=gamma,
            h=h,
        )
        return fs

[docs]    @staticmethod
    def from_angsd(
        sfs_file, sample_sizes, pop_ids=None, folded=False, mask_corners=True
    ):
        """
        Convert ANGSD output to a moments Spectrum object. The sample sizes
        are given as number of haploid genome copies (twice the number of
        sampled diploid individuals).

        :param string sfs_file: The n-dimensional SFS from ANGSD. This should be a
            file with a single line of numbers, as entries in the SFS.
        :param list sample_sizes: A list of integers with length equal to the number
            of population, storing the haploid sample size in each population.
            The order must match the population order provided to ANGSD.
        :param list pop_ids: A list of strings equal with length equal to the number
            of population, specifying the population name for each.
        :param bool folded: If False (default), we assume ancestral states are
            known, returning an unfolded SFS. If True, the returned SFS is folded.
        :param bool mask_corners: If True (default), mask the fixed bins in the
            SFS. If False, the fixed bins will remain unmasked.
        :return: A ``moments`` site frequency spectrum.
        :rtype: :class:`moments.Spectrum`
        """
        if pop_ids is not None and len(pop_ids) != len(sample_sizes):
            raise ValueError(
                "length of pop ids must equal the number of "
                "populations given by sample sizes"
            )

        if len(sample_sizes) > 2:
            warnings.warn(
                "Importing ANGSD-formatted data has only been tested for 2 populations",
                warnings.UserWarning,
            )
        # get the SFS data
        with open(sfs_file) as f_data:
            data_line = f_data.readline()

        data = np.array([float(d) for d in data_line.split()])

        # check that the length of data matches the number of bins in the SFS
        reshape_dims = [n + 1 for n in sample_sizes]
        if np.prod(reshape_dims) != len(data):
            raise ValueError("size of data does not match input sample sizes")

        data = np.reshape(data, reshape_dims)

        # turn into moments object
        fs = moments.Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
        if folded:
            fs = fs.fold()
        return fs

    # The code below ensures that when I do arithmetic with Spectrum objects,
    # it is not done between a folded and an unfolded array. If it is, I raise
    # a ValueError.

    # While I'm at it, I'm also fixing the annoying behavior that if a1 and a2
    # are masked arrays, and a3 = a1 + a2. Then wherever a1 or a2 was masked,
    # a3.data ends up with the a1.data values, rather than a1.data + a2.data.
    # Note that this fix doesn't work for operation by np.ma.exp and
    # np.ma.log. Guess I can't have everything.

    # I'm using exec here to avoid copy-pasting a dozen boiler-plate functions.
    # The calls to check_folding_equal ensure that we don't try to combine
    # folded and unfolded Spectrum objects.

    # I set check_folding = False in the constructor because it raises useless
    # warnings when, for example, I do (model + 1).

    # These functions also ensure that the pop_ids
    # get properly copied over.

    # This is pretty advanced Python voodoo, so don't fret if you don't
    # understand it at first glance. :-)
    for method in [
        "__add__",
        "__radd__",
        "__sub__",
        "__rsub__",
        "__mul__",
        "__rmul__",
        "__div__",
        "__rdiv__",
        "__truediv__",
        "__rtruediv__",
        "__floordiv__",
        "__rfloordiv__",
        "__rpow__",
        "__pow__",
    ]:
        exec(
            """
def %(method)s(self, other):
    self._check_other_folding(other)
    if isinstance(other, np.ma.masked_array):
        newdata = self.data.%(method)s (other.data)
        newmask = np.ma.mask_or(self.mask, other.mask)
    else:
        newdata = self.data.%(method)s (other)
        newmask = self.mask
    newpop_ids = self.pop_ids
    if hasattr(other, 'pop_ids'):
        if other.pop_ids is None:
            newpop_ids = self.pop_ids
        elif self.pop_ids is None:
            newpop_ids = other.pop_ids
        elif other.pop_ids != self.pop_ids:
            logger.warn('Arithmetic between Spectra with different pop_ids. '
                        'Resulting pop_id may not be correct.')
    outfs = self.__class__.__new__(self.__class__, newdata, newmask, 
                                   mask_corners=False, data_folded=self.folded,
                                   check_folding=False, pop_ids=newpop_ids)
    return outfs
"""
            % {"method": method}
        )

    # Methods that modify the Spectrum in-place.
    for method in [
        "__iadd__",
        "__isub__",
        "__imul__",
        "__idiv__",
        "__itruediv__",
        "__ifloordiv__",
        "__ipow__",
    ]:
        exec(
            """
def %(method)s(self, other):
    self._check_other_folding(other)
    if isinstance(other, np.ma.masked_array):
        self.data.%(method)s (other.data)
        self.mask = np.ma.mask_or(self.mask, other.mask)
    else:
        self.data.%(method)s (other)
    if hasattr(other, 'pop_ids') and other.pop_ids is not None\
             and other.pop_ids != self.pop_ids:
        logger.warn('Arithmetic between Spectra with different pop_ids. '
                    'Resulting pop_id may not be correct.')
    return self
"""
            % {"method": method}
        )

    def _check_other_folding(self, other):
        """
        Ensure other Spectrum has same .folded status
        """
        if isinstance(other, self.__class__) and other.folded != self.folded:
            raise ValueError(
                "Cannot operate with a folded Spectrum and an " "unfolded one."
            )


# Allow spectrum objects to be pickled.
# See http://effbot.org/librarybook/copy-reg.htm
try:
    import copy_reg

    def Spectrum_unpickler(data, mask, data_folded, pop_ids):
        return moments.Spectrum(
            data,
            mask,
            mask_corners=False,
            data_folded=data_folded,
            check_folding=False,
            pop_ids=pop_ids,
        )

    def Spectrum_pickler(fs):
        return (
            Spectrum_unpickler,
            (fs.data, fs.mask, fs.folded, fs.pop_ids),
        )

    copy_reg.pickle(Spectrum, Spectrum_pickler, Spectrum_unpickler)
except:
    import copyreg

    def Spectrum_unpickler(data, mask, data_folded, pop_ids):
        return moments.Spectrum(
            data,
            mask,
            mask_corners=False,
            data_folded=data_folded,
            check_folding=False,
            pop_ids=pop_ids,
        )

    def Spectrum_pickler(fs):
        return (
            Spectrum_unpickler,
            (fs.data, fs.mask, fs.folded, fs.pop_ids),
        )

    copyreg.pickle(Spectrum, Spectrum_pickler, Spectrum_unpickler)