Source code for moments.Spectrum_mod

"""
Contains Spectrum object, which represents frequency spectra.
"""
import logging

logging.basicConfig()
logger = logging.getLogger("Spectrum_mod")
import functools
import operator
import os
import sys
import numpy as np
from numpy import newaxis as nuax
import scipy.misc as misc
import scipy.stats
import copy
import itertools
import warnings
import demes

# Account for difference in scipy installations.
try:
    from scipy.misc import comb
except ImportError:
    try:
        from scipy.special import comb
    except:
        from scipy import comb
from scipy.integrate import trapz
from scipy.special import betainc

import moments.Integration
import moments.Integration_nomig
from . import Numerics
import moments.Demes as Demes

_plotting = True
try:
    import moments.ModelPlot
except ImportError:
    # if matplotlib is not present, do not import, and do not run plotting functions
    _plotting = False


class Spectrum(np.ma.masked_array):
    """
    Represents a single-locus biallelic frequency spectrum.

    Spectra are represented by masked arrays. The masking allows us to
    ignore specific entries in the spectrum. When simulating under the
    standard infinite sites model (ISM), the entries we mask are the bins
    specifying absent or fixed variants. When using a reversible mutation
    model (i.e. the finite genome model), we track the density of variants
    in the fixed bins, setting ``mask_corners`` to ``False``.

    :param data: An array with dimension equal to the number of populations.
        Each dimension has length :math:`n_i+1`, where :math:`n_i` is the
        sample size for the i-th population.
    :type data: array
    :param mask: An optional array of the same size as data. 'True' entries
        in this array are masked in the Spectrum. These represent missing
        data categories. (For example, you may not trust your singleton SNP
        calling.)
    :type mask: array, optional
    :param mask_corners: If True (default), the 'observed in none' and
        'observed in all' entries of the FS will be masked. Typically these
        entries are masked. In the default infinite sites model, moments
        does not reliably calculate the fixed-bin entries, so you will
        almost always want ``mask_corners=True``. The exception is if we are
        simulating under the finite genome model, in which case we track the
        probability of a site to be fixed for either allele.
    :type mask_corners: bool, optional
    :param data_folded: If True, it is assumed that the input data is
        folded. An error will be raised if the input data and mask are not
        consistent with a folded Spectrum.
    :type data_folded: bool, optional
    :param check_folding: If True and data_folded=True, the data and mask
        will be checked to ensure they are consistent with a folded
        Spectrum. If they are not, a warning will be printed.
    :type check_folding: bool, optional
    :param pop_ids: Optional list of strings containing the population
        labels, with length equal to the dimension of ``data``.
    :type pop_ids: list of strings, optional
    :return: A frequency spectrum object, as a masked array.
    """

    def __new__(
        subtype,
        data,
        mask=np.ma.nomask,
        mask_corners=True,
        data_folded=None,
        check_folding=True,
        dtype=float,
        copy=True,
        fill_value=np.nan,
        keep_mask=True,
        shrink=True,
        pop_ids=None,
    ):
        data = np.asanyarray(data)

        if mask is np.ma.nomask:
            mask = np.ma.make_mask_none(data.shape)
        else:
            # Since a mask is given, we do not mask the corners
            mask_corners = False

        subarr = np.ma.masked_array(
            data,
            mask=mask,
            dtype=dtype,
            copy=copy,
            fill_value=np.nan,
            keep_mask=True,
            shrink=True,
        )
        subarr = subarr.view(subtype)

        if hasattr(data, "folded"):
            if data_folded is None or data_folded == data.folded:
                subarr.folded = data.folded
            elif data_folded != data.folded:
                raise ValueError(
                    "Data does not have same folding status as "
                    "was called for in Spectrum constructor."
                )
        elif data_folded is not None:
            subarr.folded = data_folded
        else:
            subarr.folded = False

        # Check that if we're declaring that the input data is folded, it
        # actually is, and that the mask reflects this.
        if data_folded:
            total_samples = np.sum(subarr.sample_sizes)
            total_per_entry = subarr._total_per_entry()
            # Which entries are nonsense in the folded fs.
            where_folded_out = total_per_entry > int(total_samples / 2)
            if check_folding and not np.all(subarr.data[where_folded_out] == 0):
                logger.warning(
                    "Creating Spectrum with data_folded = True, but "
                    "data has non-zero values in entries which are "
                    "nonsensical for a folded Spectrum."
                )
            if check_folding and not np.all(subarr.mask[where_folded_out]):
                logger.warning(
                    "Creating Spectrum with data_folded = True, but "
                    "mask is not True for all entries which are "
                    "nonsensical for a folded Spectrum."
                )

        if hasattr(data, "pop_ids"):
            if pop_ids is None or pop_ids == data.pop_ids:
                subarr.pop_ids = data.pop_ids
            elif pop_ids != data.pop_ids:
                logger.warning(
                    "Changing population labels in construction of new "
                    "Spectrum."
                )
                if len(pop_ids) != subarr.ndim:
                    raise ValueError(
                        "pop_ids must be of length equal to "
                        "dimensionality of Spectrum."
                    )
                subarr.pop_ids = pop_ids
        else:
            if pop_ids is not None and len(pop_ids) != subarr.ndim:
                raise ValueError(
                    "pop_ids must be of length equal to "
                    "dimensionality of Spectrum."
                )
            subarr.pop_ids = pop_ids

        if mask_corners:
            subarr.mask_corners()

        return subarr

    # See http://www.scipy.org/Subclasses for information on the
    # __array_finalize__ and __array_wrap__ methods. I had to do some
    # debugging myself to discover that I also needed _update_from.
    # Also, see http://docs.scipy.org/doc/np/reference/arrays.classes.html
    # Also, see http://docs.scipy.org/doc/np/user/basics.subclassing.html
    #
    # We need these methods to ensure extra attributes get copied along when
    # we do arithmetic on the FS.
    def __array_finalize__(self, obj):
        if obj is None:
            return
        np.ma.masked_array.__array_finalize__(self, obj)
        self.folded = getattr(obj, "folded", "unspecified")
        self.pop_ids = getattr(obj, "pop_ids", None)

    def __array_wrap__(self, obj, context=None):
        result = obj.view(type(self))
        result = np.ma.masked_array.__array_wrap__(self, obj, context=context)
        result.folded = self.folded
        result.pop_ids = self.pop_ids
        return result

    def _update_from(self, obj):
        np.ma.masked_array._update_from(self, obj)
        if hasattr(obj, "folded"):
            self.folded = obj.folded
        if hasattr(obj, "pop_ids"):
            self.pop_ids = obj.pop_ids

    # masked_array has priority 20.
    __array_priority__ = 20

    def __repr__(self):
        return "Spectrum(%s, folded=%s, pop_ids=%s)" % (
            str(self),
            str(self.folded),
            str(self.pop_ids),
        )

    # Functions for manipulating frequency spectra.
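    # Example (a minimal usage sketch, not part of the library source):
    # construct a 1D spectrum for n = 4 sampled chromosomes. With the
    # default mask_corners=True, the fixed bins (entries 0 and n) are
    # masked.
    #
    #   import moments
    #   fs = moments.Spectrum([0, 6, 3, 2, 0], pop_ids=["pop0"])
    #   fs.sample_sizes   # array([4])
    #   fs.mask.flat[0]   # True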
    def mask_corners(self):
        """
        Mask the 'seen in 0 samples' and 'seen in all samples' entries.
        """
        self.mask.flat[0] = self.mask.flat[-1] = True
    def unmask_all(self):
        """
        Unmask all entries of the frequency spectrum.
        """
        self.mask[tuple([slice(None)] * self.Npop)] = False
    def _get_sample_sizes(self):
        return np.asarray(self.shape) - 1

    sample_sizes = property(_get_sample_sizes)

    def _get_Npop(self):
        return self.ndim

    Npop = property(_get_Npop)

    def _ensure_dimension(self, Npop):
        """
        Ensure that fs has Npop dimensions.
        """
        if not self.Npop == Npop:
            raise ValueError("Only compatible with %id spectra." % Npop)
    def project(self, ns):
        """
        Project to a smaller sample size.

        ``project`` does *not* act in-place, so the input frequency spectrum
        is not changed.

        :param ns: Sample sizes for new spectrum.
        :type ns: list of integers
        """
        if len(ns) != self.Npop:
            raise ValueError(
                "Requested sample sizes not of same dimension "
                "as spectrum. Perhaps you need to marginalize "
                "over some populations first?"
            )
        if np.any(np.asarray(ns) > np.asarray(self.sample_sizes)):
            raise ValueError(
                "Cannot project to a sample size greater than "
                "original. Original size is %s and requested size "
                "is %s." % (self.sample_sizes, ns)
            )

        original_folded = self.folded
        # If we started with a folded Spectrum, we need to unfold before
        # projecting.
        if original_folded:
            output = self.unfold()
        else:
            output = self.copy()

        # Iterate over each axis, applying the projection.
        for axis, proj in enumerate(ns):
            if proj != self.sample_sizes[axis]:
                output = output._project_one_axis(proj, axis)

        output.pop_ids = self.pop_ids
        # Return folded or unfolded as original.
        if original_folded:
            return output.fold()
        else:
            return output
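    # Example (hypothetical usage sketch): project a 1D spectrum with
    # n = 10 down to n = 6; the original spectrum is left unchanged.
    #
    #   fs = moments.Spectrum(np.arange(11, dtype=float))
    #   fs6 = fs.project([6])
    #   fs6.sample_sizes  # array([6])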
    def _project_one_axis(self, n, axis=0):
        """
        Project along a single axis.
        """
        # This gets a little tricky with fancy indexing to make it work for
        # fs with an arbitrary number of dimensions.
        if n > self.sample_sizes[axis]:
            raise ValueError(
                "Cannot project to a sample size greater than "
                "original. Called sizes were from %s to %s."
                % (self.sample_sizes[axis], n)
            )

        newshape = list(self.shape)
        newshape[axis] = n + 1
        # Create a new empty fs that we'll fill in below.
        pfs = Spectrum(np.zeros(newshape), mask_corners=False)

        # Set up for our fancy indexes. These slices are currently like
        # [:,:,...]
        from_slice = [slice(None) for ii in range(self.Npop)]
        to_slice = [slice(None) for ii in range(self.Npop)]
        proj_slice = [nuax for ii in range(self.Npop)]

        proj_from = self.sample_sizes[axis]
        # For each possible number of hits.
        for hits in range(proj_from + 1):
            # Adjust the slice in the array we're projecting from.
            from_slice[axis] = slice(hits, hits + 1)
            # These are the least and most possible hits we could have in
            # the projected fs.
            least, most = max(n - (proj_from - hits), 0), min(hits, n)
            to_slice[axis] = slice(least, most + 1)
            # The projection weights.
            proj = Numerics._cached_projection(n, proj_from, hits)
            proj_slice[axis] = slice(least, most + 1)
            # Do the multiplications
            pfs.data[tuple(to_slice)] += (
                self.data[tuple(from_slice)] * proj[tuple(proj_slice)]
            )
            pfs.mask[tuple(to_slice)] = np.logical_or(
                pfs.mask[tuple(to_slice)], self.mask[tuple(from_slice)]
            )

        return pfs
    def marginalize(self, over, mask_corners=None):
        """
        Returns a reduced-dimensionality spectrum, summing over the set of
        populations given by ``over``.

        ``marginalize`` does not act in-place, so the input frequency
        spectrum will not be altered.

        :param over: List of axes to sum over. For example (0,2) will
            marginalize populations 0 and 2.
        :type over: list of integers
        :param mask_corners: If True, the fixed bins of the resulting
            spectrum will be masked. The default behavior is to mask the
            corners only if at least one of the corners of the input
            frequency spectrum is masked.
        :type mask_corners: bool, optional
        """
        if _plotting:
            # Update ModelPlot
            model = moments.ModelPlot._get_model()
            if model is not None:
                model.extinction(over)

        original_folded = self.folded
        # If we started with a folded Spectrum, we need to unfold before
        # marginalizing.
        if original_folded:
            output = self.unfold()
        else:
            output = self.copy()

        orig_mask = output.mask.copy()
        orig_mask.flat[0] = orig_mask.flat[-1] = False
        if np.any(orig_mask):
            logger.warning(
                "Marginalizing a Spectrum with internal masked values. "
                "This may not be a well-defined operation."
            )

        # Do the marginalization
        for axis in sorted(over)[::-1]:
            output = output.sum(axis=axis)
        pop_ids = None
        if self.pop_ids is not None:
            pop_ids = list(self.pop_ids)
            for axis in sorted(over)[::-1]:
                del pop_ids[axis]
        output.folded = False
        output.pop_ids = pop_ids

        if mask_corners is None:
            if self.mask.flat[0] == True or self.mask.flat[-1] == True:
                mask_corners = True
            else:
                mask_corners = False
        if mask_corners:
            output.mask_corners()

        # Return folded or unfolded as original.
        if original_folded:
            return output.fold()
        else:
            return output
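    # Example (hypothetical usage sketch): marginalize the second
    # population out of a 2D spectrum, leaving the 1D spectrum of the first.
    #
    #   fs2d = moments.Spectrum(np.ones((5, 5)), pop_ids=["A", "B"])
    #   fsA = fs2d.marginalize([1])
    #   fsA.pop_ids       # ["A"]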
    def swap_axes(self, ax1, ax2):
        """
        Uses np's swapaxes function, but also swaps pop_ids as appropriate
        if pop_ids are given.

        .. note::
            ``fs.swapaxes(ax1, ax2)`` will still work, but if population ids
            are given, it won't swap the pop_ids entries as expected.

        :param ax1: The index of the first population to swap.
        :type ax1: int
        :param ax2: The index of the second population to swap.
        :type ax2: int
        """
        output = np.swapaxes(self, ax1, ax2)
        if output.pop_ids is not None:
            pop1, pop2 = output.pop_ids[ax1], output.pop_ids[ax2]
            output.pop_ids[ax1], output.pop_ids[ax2] = pop2, pop1
        return output
    def _counts_per_entry(self):
        """
        Counts per population for each entry in the fs.
        """
        ind = np.indices(self.shape)
        # Transpose the first axis to the last, so ind[ii,jj,kk] = [ii,jj,kk]
        ind = ind.transpose(list(range(1, self.Npop + 1)) + [0])
        return ind

    def _total_per_entry(self):
        """
        Total derived alleles for each entry in the fs.
        """
        return np.sum(self._counts_per_entry(), axis=-1)
    def log(self):
        """
        Returns the natural logarithm of the entries of the frequency
        spectrum.

        Only necessary because np.ma.log now fails to propagate extra
        attributes after np 1.10.
        """
        logfs = np.ma.log(self)
        logfs.folded = self.folded
        logfs.pop_ids = self.pop_ids
        return logfs
    def fold(self):
        """
        Returns a folded frequency spectrum.

        The folded fs assumes that information on which allele is ancestral
        or derived is unavailable. Thus the fs is in terms of minor allele
        frequency. This makes the fs into a "triangular" array. If a masked
        cell is folded into a non-masked cell, the destination cell is
        masked as well.

        Folding is not done in-place. The return value is a new Spectrum
        object.
        """
        if self.folded:
            raise ValueError("Input Spectrum is already folded.")

        # How many samples total do we have? The folded fs can only contain
        # entries up to total_samples/2 (rounded down).
        total_samples = np.sum(self.sample_sizes)

        total_per_entry = self._total_per_entry()
        # Here's where we calculate which entries are nonsense in the folded
        # fs.
        where_folded_out = total_per_entry > int(total_samples / 2)

        original_mask = self.mask
        # Here we create a mask that masks any values that were masked in
        # the original fs (or folded onto by a masked value).
        final_mask = np.logical_or(
            original_mask, Numerics.reverse_array(original_mask)
        )

        # To do the actual folding, we take those entries that would be
        # folded out, reverse the array along all axes, and add them back to
        # the original fs.
        reversed = Numerics.reverse_array(np.where(where_folded_out, self, 0))
        folded = np.ma.masked_array(self.data + reversed)
        folded.data[where_folded_out] = 0

        # Deal with those entries where assignment of the minor allele is
        # ambiguous.
        where_ambiguous = total_per_entry == total_samples / 2.0
        ambiguous = np.where(where_ambiguous, self, 0)
        folded += -0.5 * ambiguous + 0.5 * Numerics.reverse_array(ambiguous)

        # Mask out the remains of the folding operation.
        final_mask = np.logical_or(final_mask, where_folded_out)

        outfs = Spectrum(
            folded, mask=final_mask, data_folded=True, pop_ids=self.pop_ids
        )
        return outfs
    def unfold(self):
        """
        Returns an unfolded frequency spectrum.

        It is assumed that each state of a SNP is equally likely to be
        ancestral.

        Unfolding is not done in-place. The return value is a new Spectrum
        object.
        """
        if not self.folded:
            raise ValueError("Input Spectrum is not folded.")

        # Unfolding the data is easy.
        reversed_data = Numerics.reverse_array(self.data)
        newdata = (self.data + reversed_data) / 2.0

        # Unfolding the mask is trickier. We want to preserve masking of
        # entries that were masked in the original Spectrum.
        # Which entries in the original Spectrum were masked solely because
        # they are incompatible with a folded Spectrum?
        total_samples = np.sum(self.sample_sizes)
        total_per_entry = self._total_per_entry()
        where_folded_out = total_per_entry > int(total_samples / 2)

        newmask = np.logical_xor(self.mask, where_folded_out)
        newmask = np.logical_or(newmask, Numerics.reverse_array(newmask))

        outfs = Spectrum(
            newdata, mask=newmask, data_folded=False, pop_ids=self.pop_ids
        )
        return outfs
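    # Example (hypothetical usage sketch): fold a spectrum to minor-allele
    # frequencies, then unfold under the assumption that either allele is
    # equally likely to be ancestral. The round trip symmetrizes the
    # spectrum, so it does not generally recover the original.
    #
    #   fs = moments.Spectrum([0, 6, 3, 2, 1, 0])
    #   folded = fs.fold()
    #   symmetric = folded.unfold()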
    # Functions that apply demographic events, including integration.
    def split(self, idx, n0, n1, new_ids=None):
        """
        Splits a population in the SFS into two populations, with the extra
        population placed at the end. Returns a new frequency spectrum.

        :param idx: The index of the population to split.
        :type idx: int
        :param n0: The sample size of the first split population.
        :type n0: int
        :param n1: The sample size of the second split population.
        :type n1: int
        :param new_ids: The population IDs of the split populations. Can
            only be used if pop_ids are given for the input spectrum.
        :type new_ids: list of strings, optional
        """
        if self.folded:
            raise ValueError("Cannot perform split on folded spectrum.")
        if self.pop_ids is None and new_ids is not None:
            raise ValueError("Trying to assign ids to a SFS with no pop_ids.")
        if new_ids is not None and len(new_ids) != 2:
            raise ValueError("new_ids must be a list of two population id strings")
        fs_split = moments.Manips.split_by_index(self, idx, n0, n1)
        if new_ids is not None:
            # Copy the labels so we don't mutate the input spectrum's
            # pop_ids.
            fs_split.pop_ids = list(self.pop_ids)
            fs_split.pop_ids[idx] = new_ids[0]
            fs_split.pop_ids.append(new_ids[1])
        return fs_split
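    # Example (hypothetical usage sketch, assuming the steady-state neutral
    # model helper moments.Demographics1D.snm is available): split a single
    # population with n = 10 samples into populations of 4 and 6 samples.
    #
    #   fs = moments.Demographics1D.snm([10])
    #   fs2 = fs.split(0, 4, 6)
    #   fs2.sample_sizes  # array([4, 6])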
    def branch(self, idx, n, new_id=None):
        """
        A "branch" event, where a population gives rise to a child
        population while persisting. This is conceptually similar to the
        split event. The number of lineages in the new population is
        provided, and the number of lineages in the source/parental
        population is the original sample size minus the number requested
        for the branched population. Returns a new frequency spectrum.

        :param idx: The index of the population to branch.
        :type idx: int
        :param n: The sample size of the new population.
        :type n: int
        :param new_id: The population ID of the branch population. The
            parental population retains its original population ID. Can only
            be used if pop_ids are given for the input spectrum.
        :type new_id: str, optional
        """
        if self.folded:
            raise ValueError("Cannot perform branch on folded spectrum.")
        if self.pop_ids is None and new_id is not None:
            raise ValueError("Trying to assign ids to a SFS with no pop_ids.")
        if idx < 0 or idx >= self.Npop:
            raise ValueError(
                f"Cannot branch population index {idx} in SFS of size {self.Npop}"
            )
        n_parent = self.sample_sizes[idx]
        n0 = n_parent - n
        fs_branch = moments.Manips.split_by_index(self, idx, n0, n)
        if new_id is not None:
            # Copy the labels so we don't mutate the input spectrum's
            # pop_ids.
            fs_branch.pop_ids = list(self.pop_ids)
            fs_branch.pop_ids.append(new_id)
        return fs_branch
    def admix(self, idx0, idx1, num_lineages, proportion, new_id=None):
        """
        Returns a new frequency spectrum with an admixed population that
        arose through admixture from the indexed populations, with the given
        number of lineages and given proportions from the parental
        populations. This serves as a wrapper for ``Manips.admix_into_new``,
        with the added feature of handling pop_ids.

        If the number of lineages that move equals the number of lineages
        previously present in a source population, that source population is
        marginalized.

        :param idx0: Index of the first source population.
        :type idx0: int
        :param idx1: Index of the second source population.
        :type idx1: int
        :param num_lineages: Number of lineages in the new population.
            Cannot be greater than the number of existing lineages in either
            source population.
        :type num_lineages: int
        :param proportion: The proportion of lineages that come from the
            first source population (1-proportion of the ancestry comes from
            the second source population). Must be a number between 0 and 1.
        :type proportion: float
        :param new_id: The ID of the new population. Can only be used if the
            population IDs are specified in the input SFS.
        :type new_id: str, optional
        """
        if new_id is not None and self.pop_ids is None:
            raise ValueError("Cannot specify new pop ids if input SFS has no pop_ids")
        if proportion < 0 or proportion > 1:
            raise ValueError("proportion must be between 0 and 1")
        if idx0 == idx1:
            raise ValueError("Cannot admix population with itself")
        if idx0 < 0 or idx0 >= self.Npop or idx1 < 0 or idx1 >= self.Npop:
            raise ValueError(f"Population indexes must be between 0 and {self.Npop-1}")
        fs_admix = moments.Manips.admix_into_new(
            self, idx0, idx1, num_lineages, proportion
        )
        if new_id is not None:
            new_pop_ids = copy.copy(self.pop_ids)
            # Remove pop ids for marginalized pops
            for idx in sorted([idx0, idx1])[::-1]:
                if self.sample_sizes[idx] == num_lineages:
                    del new_pop_ids[idx]
            new_pop_ids.append(new_id)
            fs_admix.pop_ids = new_pop_ids
        return fs_admix
    def pulse_migrate(self, idx_from, idx_to, keep_from, proportion):
        """
        Mass migration (pulse admixture) between two existing populations.
        The target (destination) population has the same number of lineages
        in the output SFS, and the source population has ``keep_from``
        lineages after the pulse event. The proportion is the expected
        ancestry proportion in the target population that comes from the
        source population.

        This serves as a wrapper for ``Manips.admix_inplace``.

        Because this is an approximate operation, we often need a large
        number of lineages from the source population to maintain accuracy,
        depending on the proportion and the number of lineages.

        :param idx_from: Index of the source population.
        :type idx_from: int
        :param idx_to: Index of the target population.
        :type idx_to: int
        :param keep_from: Number of lineages to keep in the source
            population.
        :type keep_from: int
        :param proportion: Ancestry proportion of the source population that
            migrates to the target population.
        :type proportion: float
        """
        if idx_from < 0 or idx_from >= self.Npop or idx_to < 0 or idx_to >= self.Npop:
            raise ValueError(f"Invalid population index for {self.Npop}D SFS.")
        if proportion < 0 or proportion > 1:
            raise ValueError("proportion must be between 0 and 1")
        if idx_from == idx_to:
            raise ValueError("Cannot admix population into itself")
        fs_pulse = moments.Manips.admix_inplace(
            self, idx_from, idx_to, keep_from, proportion
        )
        if self.pop_ids is not None:
            fs_pulse.pop_ids = self.pop_ids
        return fs_pulse
    # Integrate the SFS in-place
    def integrate(
        self,
        Npop,
        tf,
        dt_fac=0.02,
        gamma=None,
        h=None,
        m=None,
        theta=1.0,
        adapt_dt=False,
        finite_genome=False,
        theta_fd=None,
        theta_bd=None,
        frozen=[False],
    ):
        """
        Method to simulate the spectrum's evolution for a given set of
        demographic parameters. The SFS is integrated forward in time, and
        the integration occurs in-place, meaning you need only call
        ``fs.integrate()``, and ``fs`` is updated.

        :param Npop: List of populations' relative effective sizes. Can be
            given as a list of positive values for constant sizes, or as a
            function that returns a list of sizes at a given time.
        :type Npop: list or function that returns a list
        :param tf: The total integration time in genetic units.
        :type tf: float
        :param dt_fac: The timestep factor, default is 0.02. This parameter
            typically does not need to be adjusted.
        :type dt_fac: float, optional
        :param gamma: The selection coefficient (:math:`2 N_e s`), or list
            of selection coefficients if more than one population.
        :type gamma: float or list of floats, optional
        :param h: The dominance coefficient, or list of dominance
            coefficients in each population, if more than one population.
        :type h: float or list of floats, optional
        :param m: The migration rates matrix as a 2-D array with shape nxn,
            where n is the number of populations. The entry of the migration
            matrix m[i,j] is the migration rate from pop j to pop i in
            genetic units, that is, normalized by :math:`2N_e`. ``m`` may be
            either a 2-D array, or a function that returns a 2-D array (with
            dimensions equal to (num pops)x(num pops)).
        :type m: array-like, optional
        :param theta: The scaled mutation rate :math:`4 N_e u`, which
            defaults to 1. ``theta`` can be used in the reversible model in
            the case of symmetric mutation rates. In this case, ``theta``
            must be set to << 1.
        :type theta: float, optional
        :param adapt_dt: flag to allow dt correction avoiding negative
            entries.
        :type adapt_dt: bool, optional
        :param finite_genome: If True, simulate under the finite-genome
            model with reversible mutations. If using this model, we can
            specify the forward and backward mutation rates, which are
            per-base rates that are not scaled by number of mutable loci. If
            ``theta_fd`` and ``theta_bd`` are not specified, we assume equal
            forward and backward mutation rates provided by ``theta``, which
            must be set to less than 1. Defaults to False.
        :type finite_genome: bool, optional
        :param theta_fd: The forward mutation rate :math:`4 N_e u`.
        :type theta_fd: float, optional
        :param theta_bd: The backward mutation rate :math:`4 N_e v`.
        :type theta_bd: float, optional
        :param frozen: Specifies the populations that are "frozen", meaning
            the spectrum for that population no longer changes and it does
            not exchange migrants with other populations. This feature is
            most often used to indicate ancient samples, for example,
            ancient DNA. The ``frozen`` parameter is given as a list of the
            same length as the number of pops, with ``True`` for frozen
            populations at the corresponding index, and ``False`` for
            populations that continue to evolve.
        :type frozen: list of bools
        """
        if tf < 0:
            raise ValueError("Integration time cannot be negative")

        n = np.array(self.shape) - 1

        if m is not None and not callable(m):
            m = np.array(m)

        if finite_genome and (theta_fd is None or theta_bd is None):
            if theta >= 1:
                raise ValueError(
                    "In the finite genome model, theta must be much less "
                    "than 1. If mutation rates are symmetric, use theta "
                    "<< 1. Otherwise, theta_fd and theta_bd must be "
                    "specified."
                )
            else:
                theta_fd = theta_bd = theta

        if hasattr(Npop, "__len__"):
            if np.any(frozen) and len(Npop) != len(frozen):
                raise ValueError(
                    "If one or more populations are frozen, length "
                    "of frozen must match number of simulated pops."
                )
        else:
            if np.any(frozen) and len(Npop(0)) != len(frozen):
                raise ValueError(
                    "If one or more populations are frozen, length "
                    "of frozen must match number of simulated pops."
                )

        if _plotting:
            model = moments.ModelPlot._get_model()
            if model is not None:
                model.evolve(tf, Npop, m)

        if len(n) == 1:
            if gamma is None:
                gamma = 0.0
            if h is None:
                h = 0.5
            if gamma == 0:
                self.data[:] = moments.Integration_nomig.integrate_neutral(
                    self.data, Npop, tf, dt_fac, theta,
                    finite_genome=finite_genome, theta_fd=theta_fd,
                    theta_bd=theta_bd, frozen=frozen,
                )
            else:
                self.data[:] = moments.Integration_nomig.integrate_nomig(
                    self.data, Npop, tf, dt_fac, gamma, h, theta,
                    finite_genome=finite_genome, theta_fd=theta_fd,
                    theta_bd=theta_bd, frozen=frozen,
                )
        else:
            if gamma is None:
                gamma = np.zeros(len(n))
            elif not hasattr(gamma, "__len__"):
                gamma = gamma * np.ones(len(n))
            if h is None:
                h = 0.5 * np.ones(len(n))
            elif not hasattr(h, "__len__"):
                h = h * np.ones(len(n))
            if m is None:
                m = np.zeros([len(n), len(n)])
            if not callable(m) and (m == 0).all():
                # For more than 2 populations, the sparse solver seems to be
                # faster than the tridiagonal solver.
                if (np.array(gamma) == 0).all() and len(n) < 3:
                    self.data[:] = moments.Integration_nomig.integrate_neutral(
                        self.data, Npop, tf, dt_fac, theta,
                        finite_genome=finite_genome, theta_fd=theta_fd,
                        theta_bd=theta_bd, frozen=frozen,
                    )
                else:
                    self.data[:] = moments.Integration_nomig.integrate_nomig(
                        self.data, Npop, tf, dt_fac, gamma, h, theta,
                        finite_genome=finite_genome, theta_fd=theta_fd,
                        theta_bd=theta_bd, frozen=frozen,
                    )
            else:
                self.data[:] = moments.Integration.integrate_nD(
                    self.data, Npop, tf, dt_fac, gamma, h, m, theta,
                    adapt_dt, finite_genome=finite_genome, theta_fd=theta_fd,
                    theta_bd=theta_bd, frozen=frozen,
                )
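    # Example (hypothetical usage sketch): integrate a 1D steady-state
    # spectrum through a two-epoch history, doubling the relative size for
    # 0.1 time units (of 2*Ne generations). Integration happens in-place.
    # (moments.Demographics1D.snm is assumed available.)
    #
    #   fs = moments.Demographics1D.snm([20])
    #   fs.integrate([2.0], 0.1)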
    # Functions for computing statistics from frequency spectra.
    def Fst(self, pairwise=False):
        """
        Wright's Fst between the populations represented in the fs.

        This estimate of Fst assumes random mating, because we don't have
        heterozygote frequencies in the fs.

        Calculation is by the method of Weir and Cockerham *Evolution*
        38:1358 (1984). For a single SNP, the relevant formula is at the top
        of page 1363. To combine results between SNPs, we use the weighted
        average indicated by equation 10.

        :param pairwise: Defaults to False. If True, returns a dictionary of
            all pairwise Fst within the multi-dimensional spectrum.
        :type pairwise: bool
        """
        if pairwise:
            Fsts = {}
            for pair in itertools.combinations(range(self.ndim), 2):
                to_marginalize = list(range(self.ndim))
                to_marginalize.pop(to_marginalize.index(pair[0]))
                to_marginalize.pop(to_marginalize.index(pair[1]))
                fs_marg = self.marginalize(to_marginalize)
                if self.pop_ids is not None:
                    pair_key = (self.pop_ids[pair[0]], self.pop_ids[pair[1]])
                else:
                    pair_key = pair
                Fsts[pair_key] = fs_marg.Fst()
            return Fsts
        else:
            # This gets a little obscure because we want to be able to work
            # with spectra of arbitrary dimension.

            # First quantities from page 1360
            r = self.Npop
            ns = self.sample_sizes
            nbar = np.mean(ns)
            nsum = np.sum(ns)
            nc = (nsum - np.sum(ns ** 2) / nsum) / (r - 1)

            # counts_per_pop is an r+1 dimensional array, where the last
            # axis simply records the indices of the entry.
            # For example, counts_per_pop[4,19,8] = [4,19,8]
            counts_per_pop = np.indices(self.shape)
            counts_per_pop = np.transpose(
                counts_per_pop, axes=list(range(1, r + 1)) + [0]
            )

            # The last axis of ptwiddle is now the relative frequency of
            # SNPs in that bin in each of the populations.
            ptwiddle = 1.0 * counts_per_pop / ns

            # Note that pbar is of the same shape as fs...
            pbar = np.sum(ns * ptwiddle, axis=-1) / nsum

            # We need to use 'this_slice' to get the proper alignment
            # between ptwiddle and pbar.
            this_slice = [slice(None)] * r + [np.newaxis]
            s2 = np.sum(
                ns * (ptwiddle - pbar[tuple(this_slice)]) ** 2, axis=-1
            ) / ((r - 1) * nbar)

            # Note that this 'a' differs from equation 2, because we've used
            # equation 3 and b = 0 to solve for hbar.
            a = (
                nbar
                / nc
                * (s2 - 1 / (2 * nbar - 1) * (pbar * (1 - pbar) - (r - 1) / r * s2))
            )
            d = 2 * nbar / (2 * nbar - 1) * (pbar * (1 - pbar) - (r - 1) / r * s2)

            # The weighted sum over loci.
            asum = (self * a).sum()
            dsum = (self * d).sum()

            return asum / (asum + dsum)
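    # Example (hypothetical usage sketch, with fs3 standing in for some 3D
    # spectrum with pop_ids=["YRI", "CEU", "CHB"]): pairwise Fst returns a
    # dictionary keyed by population-label pairs.
    #
    #   fsts = fs3.Fst(pairwise=True)
    #   fsts[("YRI", "CEU")]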
    def S(self):
        """
        Returns the number of segregating sites in the frequency spectrum.
        """
        oldmask = self.mask.copy()
        self.mask_corners()
        S = self.sum()
        self.mask = oldmask
        return S
    def Watterson_theta(self):
        """
        Returns Watterson's estimator of theta.

        .. note::
            This function is only sensible for 1-dimensional spectra.
        """
        if self.Npop != 1:
            raise ValueError("Only defined on a one-dimensional fs.")

        n = self.sample_sizes[0]
        S = self.S()
        an = np.sum(1.0 / np.arange(1, n))

        return S / an
    def theta_L(self):
        """
        Returns theta_L as defined by Zeng et al. "Statistical Tests for
        Detecting Positive Selection by Utilizing High-Frequency Variants"
        (2006) Genetics

        .. note::
            This function is only sensible for 1-dimensional spectra.
        """
        if self.Npop != 1:
            raise ValueError("Only defined on a one-dimensional fs.")

        n = self.sample_sizes[0]
        return np.sum(np.arange(1, n) * self[1:n]) / (n - 1)
    def Zengs_E(self):
        """
        Returns Zeng et al.'s E statistic.

        From Zeng et al., "Statistical Tests for Detecting Positive
        Selection by Utilizing High-Frequency Variants." *Genetics*, 2006.
        """
        num = self.theta_L() - self.Watterson_theta()
        n = self.sample_sizes[0]

        # See after Eq. 3
        an = np.sum(1.0 / np.arange(1, n))
        # See after Eq. 9
        bn = np.sum(1.0 / np.arange(1, n) ** 2)
        s = self.S()

        # See immediately after Eq. 12
        theta = self.Watterson_theta()
        theta_sq = s * (s - 1.0) / (an ** 2 + bn)

        # Eq. 14
        var = (n / (2.0 * (n - 1.0)) - 1.0 / an) * theta + (
            bn / an ** 2
            + 2.0 * (n / (n - 1.0)) ** 2 * bn
            - 2 * (n * bn - n + 1.0) / ((n - 1.0) * an)
            - (3.0 * n + 1.0) / (n - 1.0)
        ) * theta_sq

        return num / np.sqrt(var)
    def pi(self):
        """
        Returns the estimated expected number of pairwise differences
        between two chromosomes in the population.

        .. note::
            This estimate includes a factor of
            sample_size / (sample_size - 1) to make
            :math:`\\mathbb{E}[\\pi] = \\theta`.
        """
        if self.ndim != 1:
            raise ValueError("Only defined for a one-dimensional SFS.")

        n = self.sample_sizes[0]
        # sample frequencies p
        p = np.arange(0, n + 1, dtype=float) / n
        # This expression derives from Gillespie's _Population_Genetics:_A
        # _Concise_Guide_, 2nd edition, section 2.6.
        return n / (n - 1.0) * 2 * np.ma.sum(self * p * (1 - p))
    def Tajima_D(self):
        """
        Returns Tajima's D.

        Following Gillespie "Population Genetics: A Concise Guide" pg. 45
        """
        if not self.Npop == 1:
            raise ValueError("Only defined on a one-dimensional SFS.")

        S = self.S()

        n = 1.0 * self.sample_sizes[0]
        pihat = self.pi()
        theta = self.Watterson_theta()

        a1 = np.sum(1.0 / np.arange(1, n))
        a2 = np.sum(1.0 / np.arange(1, n) ** 2)
        b1 = (n + 1) / (3 * (n - 1))
        b2 = 2 * (n ** 2 + n + 3) / (9 * n * (n - 1))
        c1 = b1 - 1.0 / a1
        c2 = b2 - (n + 2) / (a1 * n) + a2 / a1 ** 2

        C = np.sqrt((c1 / a1) * S + c2 / (a1 ** 2 + a2) * S * (S - 1))

        return (pihat - theta) / C
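    # Example (hypothetical usage sketch): classic 1D summary statistics.
    # At steady state, pi and Watterson's theta agree in expectation, so
    # Tajima's D is near zero. (moments.Demographics1D.snm is assumed
    # available.)
    #
    #   fs = moments.Demographics1D.snm([20])
    #   fs.S(), fs.Watterson_theta(), fs.pi(), fs.Tajima_D()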
    # Functions for resampling or generating "data" from an SFS
    def fixed_size_sample(self, nsamples, include_masked=False):
        """
        Generate a resampled SFS from the current one. The resampled SFS
        follows a multinomial distribution given by the proportion of sites
        in each bin in the original SFS.

        :param nsamples: Number of samples to include in the new SFS.
        :type nsamples: int
        :param include_masked: If True, use all bins from the SFS.
            Otherwise, use only non-masked bins. Defaults to False.
        :type include_masked: bool, optional
        """
        flat = self.flatten()
        pvals = flat.data
        if include_masked is False:
            pvals[flat.mask] = 0
        pvals /= pvals.sum()

        sample = np.random.multinomial(int(nsamples), pvals)
        sample = sample.reshape(self.shape)
        sample = moments.Spectrum(sample, mask=self.mask, pop_ids=self.pop_ids)
        sample = sample.astype(int)
        return sample
    def sample(self):
        """
        Generate a Poisson-sampled fs from the current one.

        Entries where the current fs is masked or 0 will be masked in the
        output sampled fs.
        """
        # These are entries where the sampling has no meaning. Either the
        # fs is 0 there or masked.
        bad_entries = np.logical_or(self == 0, self.mask)
        # We convert to a 1-d array for passing into the sampler
        means = self.ravel().copy()
        # Filter out those bad entries.
        means[bad_entries.ravel()] = 1
        # Sample
        samp = scipy.stats.distributions.poisson.rvs(means, size=len(means))
        # Replace bad entries...
        samp[bad_entries.ravel()] = 0
        # Convert back to a properly shaped array
        samp = samp.reshape(self.shape)
        # Convert to a fs and mask the bad entries
        samp = Spectrum(
            samp, mask=bad_entries, data_folded=self.folded, pop_ids=self.pop_ids
        )
        samp = samp.astype(int)
        return samp
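    # Example (hypothetical usage sketch): scale a model spectrum by theta
    # and draw a Poisson-sampled integer-valued data set from it.
    #
    #   model = 1000 * moments.Demographics1D.snm([20])  # theta = 1000
    #   data = model.sample()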
    def genotype_matrix(
        self, num_sites=None, sample_sizes=None, diploid_genotypes=False
    ):
        """
        Generate a genotype matrix of independent loci. For
        multi-population spectra, the individual columns are filled in the
        same order as the populations in the SFS.

        .. note::
            Sites in the output genotype matrix are necessarily separated by
            infinite recombination. The SFS assumes all loci are segregating
            independently, so there is no linkage between them.

        Returns a genotype matrix of size number of sites by total sample
        size.

        :param num_sites: Defaults to None, in which case we take a Poisson
            sample from the SFS. Otherwise, we take a fixed number of sites.
        :param sample_sizes: The sample size in each population, as a list
            with length equal to the number of dimensions (populations) in
            the SFS.
        :param diploid_genotypes: Defaults to False, in which case we return
            a haplotype matrix of size (num_sites x sum(sample_sizes)). If
            True, we return a diploid genotype matrix (filled with 0, 1, 2)
            of size (num_sites x sum(sample_sizes)/2).
        """
        # project to sample size as needed
        if sample_sizes is None:
            sample_sizes = self.sample_sizes
        proj = self.project(sample_sizes)

        if diploid_genotypes:
            for ss in proj.sample_sizes:
                if ss % 2 != 0:
                    raise ValueError(
                        "Requested diploid genotypes, but the haploid sample "
                        f"sizes ({sample_sizes}) are not all even."
                    )

        # get sampled SFS
        if num_sites is None:
            samp = proj.sample()
            num_sites = samp.sum()
        else:
            samp = proj.fixed_size_sample(num_sites)
        assert num_sites == samp.S()  # remove once tested

        # fill in genotype matrix
        G = np.zeros((num_sites, sum(proj.sample_sizes)), dtype=int)
        i = 0
        for idx in np.transpose(np.nonzero(samp)):
            for _ in range(samp[tuple(idx)]):
                g = []
                for j, n in zip(idx, sample_sizes):
                    g += list(np.random.permutation([1] * j + [0] * (n - j)))
                G[i] = g
                i += 1

        # collapse diploids if needed
        hap_sum = G.sum(axis=1)
        if diploid_genotypes:
            G = G[:, ::2] + G[:, 1::2]
        assert np.all(G.sum(axis=1) == hap_sum)  # remove once tested

        # shuffle genotype matrix
        np.random.shuffle(G)
        return G
    # Functions for saving and loading frequency spectra.
    @staticmethod
    def from_file(fid, mask_corners=True, return_comments=False):
        """
        Read frequency spectrum from file.

        See ``to_file`` for details on the file format.

        :param fid: string with file name to read from or an open file
            object.
        :type fid: string
        :param mask_corners: If True, mask the 'absent in all samples' and
            'fixed in all samples' entries.
        :type mask_corners: bool, optional
        :param return_comments: If True, the return value is (fs, comments),
            where comments is a list of strings containing the comments from
            the file (without #'s).
        :type return_comments: bool, optional
        """
        newfile = False
        # Try to read from fid. If we can't, assume it's something that we
        # can use to open a file.
        if not hasattr(fid, "read"):
            newfile = True
            fid = open(fid, "r")

        line = fid.readline()
        # Strip out the comments
        comments = []
        while line.startswith("#"):
            comments.append(line[1:].strip())
            line = fid.readline()

        # Read the shape of the data
        shape_spl = line.split()
        if "folded" not in shape_spl and "unfolded" not in shape_spl:
            # This case handles the old file format
            shape = tuple([int(d) for d in shape_spl])
            folded = False
            pop_ids = None
        else:
            # This case handles the new file format
            shape, next_ii = [int(shape_spl[0])], 1
            while shape_spl[next_ii] not in ["folded", "unfolded"]:
                shape.append(int(shape_spl[next_ii]))
                next_ii += 1
            folded = shape_spl[next_ii] == "folded"
            # Are there population labels in the file?
            if len(shape_spl) > next_ii + 1:
                pop_ids = line.split('"')[1::2]
            else:
                pop_ids = None

        data = np.fromstring(
            fid.readline().strip(), count=np.product(shape), sep=" "
        )
        # fromstring returns a 1-d array. Reshape it to the proper form.
        data = data.reshape(*shape)

        maskline = fid.readline().strip()
        if not maskline:
            # The old file format didn't have a line for the mask
            mask = np.ma.nomask
        else:
            # This case handles the new file format
            mask = np.fromstring(maskline, count=np.product(shape), sep=" ")
            mask = mask.reshape(*shape)

        # If we opened a new file, clean it up.
        if newfile:
            fid.close()

        fs = Spectrum(data, mask, mask_corners, data_folded=folded, pop_ids=pop_ids)

        if not return_comments:
            return fs
        else:
            return fs, comments
    fromfile = from_file
    def to_file(self, fid, precision=16, comment_lines=[], foldmaskinfo=True):
        """
        Write frequency spectrum to file.

        The file format is:

        - Any number of comment lines beginning with a '#'
        - A single line containing N integers giving the dimensions of the
          fs array. So this line would be '5 5 3' for an SFS that was 5x5x3.
          (That would be 4x4x2 *samples*.)
        - On the *same line*, the string 'folded' or 'unfolded' denoting the
          folding status of the array
        - On the *same line*, optional strings each containing the
          population labels in quotes separated by spaces, e.g. "pop 1"
          "pop 2"
        - A single line giving the array elements. The order of elements is
          e.g.: fs[0,0,0] fs[0,0,1] fs[0,0,2] ... fs[0,1,0] fs[0,1,1] ...
        - A single line giving the elements of the mask in the same order as
          the data line. '1' indicates masked, '0' indicates unmasked.

        :param fid: string with file name to write to or an open file
            object.
        :type fid: string
        :param precision: precision with which to write out entries of the
            SFS. (They are formatted via %.<p>g, where <p> is the
            precision.) Defaults to 16.
        :type precision: int, optional
        :param comment_lines: list of strings to be used as comment lines in
            the header of the output file.
        :type comment_lines: list of strings, optional
        :param foldmaskinfo: If False, folding, mask, and population label
            information will not be saved.
        :type foldmaskinfo: bool, optional
        """
        # Open the file object.
        newfile = False
        if not hasattr(fid, "write"):
            newfile = True
            fid = open(fid, "w")

        # Write comments
        for line in comment_lines:
            fid.write("# ")
            fid.write(line.strip())
            fid.write(os.linesep)

        # Write out the shape of the fs
        for elem in self.data.shape:
            fid.write("%i " % elem)

        if foldmaskinfo:
            if not self.folded:
                fid.write("unfolded")
            else:
                fid.write("folded")
            if self.pop_ids is not None:
                for label in self.pop_ids:
                    fid.write(' "%s"' % label)

        fid.write(os.linesep)

        # Write the data to the file
        self.data.tofile(fid, " ", "%%.%ig" % precision)
        fid.write(os.linesep)

        if foldmaskinfo:
            # Write the mask to the file
            np.asarray(self.mask, int).tofile(fid, " ")
            fid.write(os.linesep)

        # Close file
        if newfile:
            fid.close()
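    # Example (hypothetical usage sketch, with fs an existing Spectrum):
    # round-trip a spectrum through the text format described above.
    #
    #   fs.to_file("example.fs", comment_lines=["simulated data"])
    #   fs2 = moments.Spectrum.from_file("example.fs")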
    # Override the (perhaps confusing) original np tofile method.
    tofile = to_file
    @staticmethod
    def from_ms_file(
        fid,
        average=True,
        mask_corners=True,
        return_header=False,
        pop_assignments=None,
        pop_ids=None,
        bootstrap_segments=1,
    ):
        """
        Read frequency spectrum from file of ms output.

        :param fid: string with file name to read from or an open file
            object.
        :param average: If True, the returned fs is the average over the
            runs in the ms file. If False, the returned fs is the sum.
        :param mask_corners: If True, mask the 'absent in all samples' and
            'fixed in all samples' entries.
        :param return_header: If True, the return value is
            (fs, (command, seeds)), where command and seeds are strings
            containing the ms command line and the seeds used.
        :param pop_assignments: If None, the assignment of samples to
            populations is done automatically, using the assignment in the
            ms command line. To manually assign populations, pass a list of
            the form [6,8]. This example places the first 6 samples into
            population 1, and the next 8 into population 2.
        :param pop_ids: Optional list of strings containing the population
            labels. If pop_ids is None, labels will be "pop0", "pop1", ...
        :param bootstrap_segments: If bootstrap_segments is an integer
            greater than 1, the data will be broken up into that many
            segments based on SNP position. Instead of a single FS, a list
            of spectra will be returned, one for each segment.
        """
        warnings.warn(
            "Spectrum.from_ms_file() is deprecated and will be removed in "
            "version 1.2",
            DeprecationWarning,
        )
        newfile = False
        # Try to read from fid. If we can't, assume it's something that we
        # can use to open a file.
        if not hasattr(fid, "read"):
            newfile = True
            fid = open(fid, "r")

        # Parse the command line
        command = line = fid.readline()
        command_terms = line.split()

        if command_terms[0].count("ms"):
            runs = int(command_terms[2])
            try:
                pop_flag = command_terms.index("-I")
                num_pops = int(command_terms[pop_flag + 1])
                pop_samples = [
                    int(command_terms[pop_flag + ii])
                    for ii in range(2, 2 + num_pops)
                ]
            except ValueError:
                num_pops = 1
                pop_samples = [int(command_terms[1])]
        else:
            raise ValueError("Unrecognized command string: %s." % command)

        total_samples = np.sum(pop_samples)
        if pop_assignments:
            num_pops = len(pop_assignments)
            pop_samples = pop_assignments
        sample_indices = np.cumsum([0] + pop_samples)
        bottom_l = sample_indices[:-1]
        top_l = sample_indices[1:]

        seeds = line = fid.readline()
        while not line.startswith("//"):
            line = fid.readline()

        counts = np.zeros(len(pop_samples), np.int_)
        fs_shape = np.asarray(pop_samples) + 1
        dimension = len(counts)

        if dimension > 1:
            bottom0 = bottom_l[0]
            top0 = top_l[0]
            bottom1 = bottom_l[1]
            top1 = top_l[1]
        if dimension > 2:
            bottom2 = bottom_l[2]
            top2 = top_l[2]
        if dimension > 3:
            bottom3 = bottom_l[3]
            top3 = top_l[3]
        if dimension > 4:
            bottom4 = bottom_l[4]
            top4 = top_l[4]
        if dimension > 5:
            bottom5 = bottom_l[5]
            top5 = top_l[5]

        all_data = [
            np.zeros(fs_shape, np.int_) for boot_ii in range(bootstrap_segments)
        ]

        for run_ii in range(runs):
            line = fid.readline()
            segsites = int(line.split()[-1])

            if segsites == 0:
                # Special case, need to read 3 lines to stay synced.
                for _ in range(3):
                    line = fid.readline()
                continue
            line = fid.readline()
            while not line.startswith("positions"):
                line = fid.readline()

            # Read SNP positions for creating bootstrap segments
            positions = [float(_) for _ in line.split()[1:]]
            # Where we should break our interval to create our bootstraps
            breakpts = np.linspace(0, 1, bootstrap_segments + 1)
            # The indices that correspond to those breakpoints
            break_iis = np.searchsorted(positions, breakpts)
            # Correct for searchsorted behavior if the last position is 1,
            # to ensure all SNPs are captured
            break_iis[-1] = len(positions)

            # Read the chromosomes in
            chromos = fid.read((segsites + 1) * total_samples)

            # For each bootstrap segment, relevant SNPs run from
            # start_ii:end_ii
            for boot_ii, (start_ii, end_ii) in enumerate(
                zip(break_iis[:-1], break_iis[1:])
            ):
                # Use the data array corresponding to this bootstrap segment
                data = all_data[boot_ii]
                for snp in range(start_ii, end_ii):
                    # Slice to get all the entries that refer to a given SNP
                    this_snp = chromos[snp :: segsites + 1]
                    # Count SNPs per population, and record them.
                    if dimension == 1:
                        data[this_snp.count("1")] += 1
                    elif dimension == 2:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                        ] += 1
                    elif dimension == 3:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                        ] += 1
                    elif dimension == 4:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                            this_snp[bottom3:top3].count("1"),
                        ] += 1
                    elif dimension == 5:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                            this_snp[bottom3:top3].count("1"),
                            this_snp[bottom4:top4].count("1"),
                        ] += 1
                    elif dimension == 6:
                        data[
                            this_snp[bottom0:top0].count("1"),
                            this_snp[bottom1:top1].count("1"),
                            this_snp[bottom2:top2].count("1"),
                            this_snp[bottom3:top3].count("1"),
                            this_snp[bottom4:top4].count("1"),
                            this_snp[bottom5:top5].count("1"),
                        ] += 1
                    else:
                        # This is noticeably slower, so we special-case the
                        # lower dimensions above.
                        for dim_ii in range(dimension):
                            bottom = bottom_l[dim_ii]
                            top = top_l[dim_ii]
                            counts[dim_ii] = this_snp[bottom:top].count("1")
                        data[tuple(counts)] += 1

            # Read to the next iteration
            line = fid.readline()
            line = fid.readline()

        if newfile:
            fid.close()

        all_fs = [
            Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
            for data in all_data
        ]
        if average:
            all_fs = [fs / runs for fs in all_fs]

        # If we aren't setting up for bootstrapping, return fs, rather than
        # a list of length 1. (This ensures backward compatibility.)
        if bootstrap_segments == 1:
            all_fs = all_fs[0]

        if not return_header:
            return all_fs
        else:
            return all_fs, (command, seeds)
    @staticmethod
    def _from_sfscode_file(
        fid,
        sites="all",
        average=True,
        mask_corners=True,
        return_header=False,
        pop_ids=None,
    ):
        """
        Read frequency spectrum from file of sfs_code output.

        :param fid: string with file name to read from or an open file
            object.
        :param sites: If sites=='all', return the fs of all sites. If
            sites=='syn', use only synonymous mutations. If
            sites=='nonsyn', use only non-synonymous mutations.
        :param average: If True, the returned fs is the average over the
            runs in the file. If False, the returned fs is the sum.
        :param mask_corners: If True, mask the 'absent in all samples' and
            'fixed in all samples' entries.
        :param return_header: If True, the return value is
            (fs, (command, seeds)), where command and seeds are strings
            containing the sfs_code command line and the seeds used.
        :param pop_ids: Optional list of strings containing the population
            labels. If pop_ids is None, labels will be "pop0", "pop1", ...
        """
        warnings.warn(
            "Spectrum.from_sfscode_file() is deprecated and will be removed "
            "in version 1.2",
            DeprecationWarning,
        )
        newfile = False
        # Try to read from fid. If we can't, assume it's something that we
        # can use to open a file.
        if not hasattr(fid, "read"):
            newfile = True
            fid = open(fid, "r")

        if sites == "all":
            only_nonsyn, only_syn = False, False
        elif sites == "syn":
            only_nonsyn, only_syn = False, True
        elif sites == "nonsyn":
            only_nonsyn, only_syn = True, False
        else:
            raise ValueError(
                "'sites' argument must be one of ('all', 'syn', 'nonsyn')."
            )

        command = fid.readline()
        command_terms = command.split()

        runs = int(command_terms[2])
        num_pops = int(command_terms[1])

        # sfs_code default is 6 individuals, and I assume diploid pop
        pop_samples = [12] * num_pops
        if "--sampSize" in command_terms or "-n" in command_terms:
            try:
                pop_flag = command_terms.index("--sampSize")
                pop_flag = command_terms.index("-n")
            except ValueError:
                pass
            pop_samples = [
                2 * int(command_terms[pop_flag + ii])
                for ii in range(1, 1 + num_pops)
            ]

        pop_samples = np.asarray(pop_samples)

        pop_digits = [str(i) for i in range(num_pops)]
        pop_fixed_str = [",%s.-1" % i for i in range(num_pops)]
        pop_count_str = [",%s." % i for i in range(num_pops)]

        seeds = fid.readline()
        line = fid.readline()

        data = np.zeros(np.asarray(pop_samples) + 1, np.int_)

        # line = //iteration...
        line = fid.readline()
        for iter_ii in range(runs):
            for ii in range(5):
                line = fid.readline()

            # It is possible for a mutation to be listed several times in
            # the output. To accommodate this, I keep a dictionary of
            # identities for those mutations, and hold off processing them
            # until I've seen all mutations listed for the iteration.
            mut_dict = {}

            # Loop until this iteration ends.
            while not line.startswith("//") and line != "":
                split_line = line.split(";")
                if split_line[-1] == "\n":
                    split_line = split_line[:-1]

                # Loop over mutations on this line.
                for mut_ii, mutation in enumerate(split_line):
                    counts_this_mut = np.zeros(num_pops, np.int_)

                    split_mut = mutation.split(",")

                    # Exclude synonymous mutations
                    if only_nonsyn and split_mut[7] == "0":
                        continue
                    # Exclude nonsynonymous mutations
                    if only_syn and split_mut[7] == "1":
                        continue

                    ind_start = len(",".join(split_mut[:12]))
                    by_individual = mutation[ind_start:]

                    mut_id = ",".join(split_mut[:4] + split_mut[5:11])

                    # Count mutations in each population
                    for pop_ii, fixed_str, count_str in zip(
                        range(num_pops), pop_fixed_str, pop_count_str
                    ):
                        if fixed_str in by_individual:
                            counts_this_mut[pop_ii] = pop_samples[pop_ii]
                        else:
                            counts_this_mut[pop_ii] = by_individual.count(
                                count_str
                            )

                    # Initialize the list that will track the counts for
                    # this mutation. Using setdefault means that it won't
                    # overwrite if there's already a list stored there.
                    mut_dict.setdefault(mut_id, [0] * num_pops)

                    for ii in range(num_pops):
                        if counts_this_mut[ii] > 0 and mut_dict[mut_id][ii] > 0:
                            sys.stderr.write(
                                "Contradicting counts between listings for "
                                "mutation %s in population %i." % (mut_id, ii)
                            )
                        mut_dict[mut_id][ii] = max(
                            counts_this_mut[ii], mut_dict[mut_id][ii]
                        )

                line = fid.readline()

            # Now apply all the mutations with fixations that we deferred.
            for mut_id, counts in mut_dict.items():
                if np.any(np.asarray(counts) > pop_samples):
                    sys.stderr.write(
                        "counts_this_mut > pop_samples: %s > %s\n%s\n"
                        % (counts, pop_samples, mut_id)
                    )
                    counts = np.minimum(counts, pop_samples)
                data[tuple(counts)] += 1

        if newfile:
            fid.close()

        fs = Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
        if average:
            fs /= runs

        if not return_header:
            return fs
        else:
            return fs, (command, seeds)
    def scramble_pop_ids(self, mask_corners=True):
        """
        Spectrum corresponding to scrambling individuals among populations.

        This is useful for assessing how diverged populations are.
        Essentially, it pools all the individuals represented in the fs and
        generates new populations of random individuals (without
        replacement) from that pool. If this fs is significantly different
        from the original, that implies population structure.
        """
        original_folded = self.folded
        # If we started with a folded Spectrum, we need to unfold before
        # scrambling.
        if original_folded:
            self = self.unfold()

        total_samp = np.sum(self.sample_sizes)

        # First generate a 1d sfs for the pooled population.
        combined = np.zeros(total_samp + 1)
        # For each entry in the fs, this is the total number of derived
        # alleles
        total_per_entry = self._total_per_entry()
        # Sum up to generate the equivalent 1-d spectrum.
        for derived, counts in zip(total_per_entry.ravel(), self.ravel()):
            combined[derived] += counts

        # Now resample back into a n-d spectrum
        # For each entry, this is the counts per population.
        # e.g. counts_per_entry[3,4,5] = [3,4,5]
        counts_per_entry = self._counts_per_entry()
        # Reshape it to be 1-d, so we can iterate over it easily.
        counts_per_entry = counts_per_entry.reshape(
            np.prod(self.shape), self.ndim
        )
        resamp = np.zeros(self.shape)
        for counts, derived in zip(counts_per_entry, total_per_entry.ravel()):
            # The probability here is
            # (t1 choose d1)*(t2 choose d2)/(ntot choose derived)
            lnprob = sum(
                Numerics._lncomb(t, d) for t, d in zip(self.sample_sizes, counts)
            )
            lnprob -= Numerics._lncomb(total_samp, derived)
            prob = np.exp(lnprob)
            # Assign result using the appropriate weighting
            resamp[tuple(counts)] += prob * combined[derived]

        resamp = Spectrum(resamp, mask_corners=mask_corners)
        if not original_folded:
            return resamp
        else:
            return resamp.fold()
    @staticmethod
    def from_data_dict(
        data_dict, pop_ids, projections, mask_corners=True, polarized=True
    ):
        """
        Spectrum from a dictionary of polymorphisms.

        The data dictionary should be organized as:

        .. code-block::

            {snp_id: {
                'segregating': ['A','T'],
                'calls': {
                    'YRI': (23,3),
                    'CEU': (7,3)
                },
                'outgroup_allele': 'T'
            }}

        The 'calls' entry gives the successful calls in each population, in
        the order that the alleles are specified in 'segregating'.
        Non-diallelic polymorphisms are skipped.

        :param pop_ids: list of which populations to make fs for.
        :param projections: list of sample sizes to project down to for each
            population.
        :param polarized: If True, the data are assumed to be correctly
            polarized by 'outgroup_allele'. SNPs in which the
            'outgroup_allele' information is missing or '-' or not
            concordant with the segregating alleles will be ignored. If
            False, any 'outgroup_allele' info present is ignored, and the
            returned spectrum is folded.
        """
        Npops = len(pop_ids)
        fs = np.zeros(np.asarray(projections) + 1)
        for snp, snp_info in data_dict.items():
            # Skip SNPs that aren't biallelic.
            if len(snp_info["segregating"]) != 2:
                continue

            allele1, allele2 = snp_info["segregating"]
            if not polarized:
                # If we don't want to polarize, we can choose which allele
                # is derived arbitrarily, since we'll fold anyway.
                outgroup_allele = allele1
            elif (
                "outgroup_allele" in snp_info
                and snp_info["outgroup_allele"] != "-"
                and snp_info["outgroup_allele"] in snp_info["segregating"]
            ):
                # Otherwise we need to check that it's a useful outgroup
                outgroup_allele = snp_info["outgroup_allele"]
            else:
                # If we're polarized and we didn't have good outgroup info,
                # skip this SNP.
                continue

            # Extract the allele calls for each population.
            allele1_calls = [snp_info["calls"][pop][0] for pop in pop_ids]
            allele2_calls = [snp_info["calls"][pop][1] for pop in pop_ids]
            # How many chromosomes did we call successfully in each
            # population?
            successful_calls = [
                a1 + a2 for (a1, a2) in zip(allele1_calls, allele2_calls)
            ]

            # Which allele is derived (different from outgroup)?
            if allele1 == outgroup_allele:
                derived_calls = allele2_calls
            elif allele2 == outgroup_allele:
                derived_calls = allele1_calls

            # To handle arbitrary numbers of populations in the fs, we need
            # to do some tricky slicing.
            slices = [[np.newaxis] * len(pop_ids) for ii in range(Npops)]
            for ii in range(len(pop_ids)):
                slices[ii][ii] = slice(None, None, None)

            # Do the projection for this SNP.
            pop_contribs = []
            iter = zip(projections, successful_calls, derived_calls)
            for pop_ii, (p_to, p_from, hits) in enumerate(iter):
                contrib = Numerics._cached_projection(p_to, p_from, hits)[
                    tuple(slices[pop_ii])
                ]
                pop_contribs.append(contrib)
            fs += functools.reduce(operator.mul, pop_contribs)

        fsout = Spectrum(fs, mask_corners=mask_corners, pop_ids=pop_ids)
        assert np.all(fsout >= 0)
        if polarized:
            if np.sum(fsout) == 0:
                warnings.warn(
                    "Spectrum is empty. Did you compute the outgroup "
                    "alleles at variable sites?",
                    RuntimeWarning,
                )
            return fsout
        else:
            if np.sum(fsout) == 0:
                warnings.warn(
                    "Spectrum is empty. Check input data dictionary.",
                    RuntimeWarning,
                )
            return fsout.fold()
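    # Example (hypothetical usage sketch): build a 2D spectrum from a
    # minimal data dictionary containing a single polarized SNP.
    #
    #   dd = {
    #       "snp1": {
    #           "segregating": ["A", "T"],
    #           "calls": {"YRI": (23, 3), "CEU": (7, 3)},
    #           "outgroup_allele": "T",
    #       }
    #   }
    #   fs = moments.Spectrum.from_data_dict(
    #       dd, pop_ids=["YRI", "CEU"], projections=[20, 8], polarized=True
    #   )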
@staticmethod def _from_count_dict(count_dict, projections, polarized=True, pop_ids=None): """ Frequency spectrum from data mapping SNP configurations to counts. :param count_dict: Result of Misc.count_data_dict :param projections: List of sample sizes to project down to for each population. :param polarized: If True, only include SNPs that count_dict marks as polarized. If False, include all SNPs and fold resulting Spectrum. :param pop_ids: Optional list of strings containing the population labels. """ # create slices for projection calculation slices = [[np.newaxis] * len(projections) for ii in range(len(projections))] for ii in range(len(projections)): slices[ii][ii] = slice(None, None, None) fs_total = moments.Spectrum( np.zeros(np.array(projections) + 1), pop_ids=pop_ids ) for ( (called_by_pop, derived_by_pop, this_snp_polarized), count, ) in count_dict.items(): if polarized and not this_snp_polarized: continue pop_contribs = [] iter = zip(projections, called_by_pop, derived_by_pop) for pop_ii, (p_to, p_from, hits) in enumerate(iter): contrib = Numerics._cached_projection(p_to, p_from, hits)[ slices[pop_ii] ] pop_contribs.append(contrib) fs_proj = functools.reduce(operator.mul, pop_contribs) # create slices for adding projected fs to overall fs fs_total += count * fs_proj if polarized: return fs_total else: return fs_total.fold() @staticmethod def _data_by_tri(data_dict): """ Nest the data by derived context and outgroup base. The resulting dictionary contains only SNPs which are appropriate for use of Hernandez's ancestral misidentification correction. It is organized as {(derived_tri, outgroup_base): {snp_id: data,...}} """ result = {} genetic_bases = "ACTG" for snp, snp_info in data_dict.items(): # Skip non-diallelic polymorphisms if len(snp_info["segregating"]) != 2: continue allele1, allele2 = snp_info["segregating"] # Filter out SNPs where we either non-constant ingroup or outgroup # context. try: ingroup_tri = snp_info["context"] outgroup_tri = snp_info["outgroup_context"] except KeyError: continue if not outgroup_tri[1] == snp_info["outgroup_allele"]: raise ValueError( "Outgroup context and allele are inconsistent " "for polymorphism: %s." % snp ) outgroup_allele = outgroup_tri[1] # These are all the requirements to apply the ancestral correction. # First 2 are constant context. # Next 2 are sensible context. # Next 1 is that outgroup allele is one of the segregating. # Next 2 are that segregating alleles are sensible. if ( outgroup_tri[0] != ingroup_tri[0] or outgroup_tri[2] != ingroup_tri[2] or ingroup_tri[0] not in genetic_bases or ingroup_tri[2] not in genetic_bases or outgroup_allele not in [allele1, allele2] or allele1 not in genetic_bases or allele2 not in genetic_bases ): continue if allele1 == outgroup_allele: derived_allele = allele2 elif allele2 == outgroup_allele: # In this case, the second allele is non_outgroup derived_allele = allele1 derived_tri = ingroup_tri[0] + derived_allele + ingroup_tri[2] result.setdefault((derived_tri, outgroup_allele), {}) result[derived_tri, outgroup_allele][snp] = snp_info return result @staticmethod def _from_data_dict_corrected( data_dict, pop_ids, projections, fux_filename, force_pos=True, mask_corners=True ): """ Spectrum from a dictionary of polymorphisms, corrected for ancestral misidentification. The correction is based upon Hernandez, Williamson & Bustamante *Mol Biol Evol* 24:1792 (2007) :param force_pos: If the correction is too agressive, it may leave some small entries in the fs less than zero. 
            If force_pos is True, these entries will be set to zero, in
            such a way that the total number of segregating SNPs is
            conserved.
        :param fux_filename: The name of the file containing the
            misidentification probabilities. The file is of the form:

            - # Any number of comment lines beginning with #
            - AAA T 0.001
            - AAA G 0.02
            - ...

            Every combination of three + one bases is considered (order is
            not important). The triplet is the context and putatively
            derived allele (x) in the reference species. The single base is
            the base (u) in the outgroup. The numerical value is 1-f_{ux}
            in the notation of the paper.

        The data dictionary should be organized as:

        .. code-block::

            {snp_id: {
                'segregating': ['A', 'T'],
                'calls': {
                    'YRI': (23, 3),
                    'CEU': (7, 3)
                },
                'outgroup_allele': 'T',
                'context': 'CAT',
                'outgroup_context': 'CAT'
            }}

        The additional entries are 'context', which includes the two
        flanking bases in the species of interest, and 'outgroup_context',
        which includes the aligned bases in the outgroup.

        This method skips entries for which the correction cannot be
        applied. Most commonly this is because of missing or non-constant
        context.
        """
        # Read the fux file into a dictionary.
        fux_dict = {}
        with open(fux_filename) as f:
            for line in f:
                if line.startswith("#"):
                    continue
                sp = line.split()
                fux_dict[(sp[0], sp[1])] = 1 - float(sp[2])

        # Divide the data into classes based on ('context', 'outgroup_allele').
        by_context = Spectrum._data_by_tri(data_dict)

        fs = np.zeros(np.asarray(projections) + 1)
        while by_context:
            # Each time through this loop, we eliminate two entries from the
            # data dictionary. These correspond to one class and its
            # corresponding misidentified class.
            (derived_tri, out_base), nomis_data = by_context.popitem()

            # The corresponding bases if the ancestral state had been
            # misidentified.
            mis_out_base = derived_tri[1]
            mis_derived_tri = derived_tri[0] + out_base + derived_tri[2]

            # Get the data for that case. Note that we default to an empty
            # dictionary if we don't have data for that class.
            mis_data = by_context.pop((mis_derived_tri, mis_out_base), {})

            fux = fux_dict[(derived_tri, out_base)]
            fxu = fux_dict[(mis_derived_tri, mis_out_base)]

            # Get the spectra for these two cases.
            Nux = Spectrum.from_data_dict(nomis_data, pop_ids, projections)
            Nxu = Spectrum.from_data_dict(mis_data, pop_ids, projections)

            # Equations 5 & 6 from the paper.
            Nxu_rev = Numerics.reverse_array(Nxu)
            Rux = (fxu * Nux - (1 - fxu) * Nxu_rev) / (fux + fxu - 1)
            Rxu = Numerics.reverse_array(
                (fux * Nxu_rev - (1 - fux) * Nux) / (fux + fxu - 1)
            )
            fs += Rux + Rxu

        # Here we take the negative entries and flip them back, so they end
        # up zero and the total number of SNPs is conserved.
        if force_pos:
            negative_entries = np.minimum(0, fs)
            fs -= negative_entries
            fs += Numerics.reverse_array(negative_entries)

        return Spectrum(fs, mask_corners=mask_corners, pop_ids=pop_ids)
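    # A sketch of a fux file and a call to the corrected constructor (the
    # file name, probabilities, and data dictionary ``dd`` are hypothetical;
    # each SNP in ``dd`` must carry 'context' and 'outgroup_context' entries):
    #
    #     # fux.txt: comment lines, then "triplet outgroup_base 1-f_ux" rows:
    #     #     AAA T 0.001
    #     #     AAA G 0.02
    #
    #     fs = Spectrum._from_data_dict_corrected(
    #         dd, pop_ids=["YRI", "CEU"], projections=[20, 8],
    #         fux_filename="fux.txt",
    #     )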
[docs]    @staticmethod
    def from_demes(
        g,
        sampled_demes=None,
        sample_sizes=None,
        sample_times=None,
        samples=None,
        Ne=None,
        unsampled_n=4,
        gamma=None,
        h=None,
    ):
        """
        Takes a deme graph and computes the SFS. ``demes`` is a package for
        specifying demographic models in a user-friendly, human-readable
        YAML format. This function automatically parses the demographic
        description and returns an SFS for the specified populations and
        sample sizes.

        .. note::
            If a deme sample time is requested that is earlier than the
            deme's end time, for example to simulate ancient samples, we
            must create a new population for that ancient sample. This can
            cause large slow-downs, as the computational cost of computing
            the SFS grows quickly with the number of populations.

        :param g: A ``demes`` DemeGraph from which to compute the SFS. The
            DemeGraph can either be specified as a YAML file, in which case
            ``g`` is a string, or as a ``DemeGraph`` object.
        :type g: str or :class:`demes.DemeGraph`
        :param sampled_demes: A list of deme IDs to take samples from. We
            can repeat demes, as long as the sampling of repeated deme IDs
            occurs at distinct times.
        :type sampled_demes: list of strings
        :param sample_sizes: A list of the same length as ``sampled_demes``,
            giving the sample sizes for each sampled deme.
        :type sample_sizes: list of ints
        :param sample_times: If None, assumes all sampling occurs at the end
            of the existence of the sampled deme. If there are ancient
            samples, ``sample_times`` must be a list of the same length as
            ``sampled_demes``, giving the sampling times for each sampled
            deme. Sampling times are given in the time units of the original
            deme graph, so they are not necessarily in generations (e.g. if
            ``g.time_units`` is years).
        :type sample_times: list of floats, optional
        :param samples: A dictionary mapping sampled deme names to sample
            sizes. May be given instead of ``sampled_demes`` and
            ``sample_sizes``, and cannot be combined with ``sampled_demes``,
            ``sample_sizes``, or ``sample_times``.
        :type samples: dict, optional
        :param Ne: Reference population size. If none is given, we use the
            initial size of the root deme.
        :type Ne: float, optional
        :param unsampled_n: The default sample size of unsampled demes,
            which must be greater than or equal to 4.
        :type unsampled_n: int, optional
        :param gamma: The scaled selection coefficient(s), 2*Ne*s. Defaults
            to None, which implies neutrality. Can be given as a scalar
            value, in which case all populations have the same selection
            coefficient. Alternatively, can be given as a dictionary, with
            keys given as population names in the input Demes model. Any
            population missing from this dictionary will be assigned a
            selection coefficient of zero. A non-zero default selection
            coefficient can be provided, using the key `_default`. See the
            Demes extension documentation for more details and examples.
        :type gamma: float or dict
        :param h: The dominance coefficient(s). Defaults to additivity (or
            genic selection). Can be given as a scalar value, in which case
            all populations have the same dominance coefficient.
            Alternatively, can be given as a dictionary, with keys given as
            population names in the input Demes model. Any population
            missing from this dictionary will be assigned a dominance
            coefficient of 1/2 (additivity). A different default dominance
            coefficient can be provided, using the key `_default`. See the
            Demes extension documentation for more details and examples.
        :type h: float or dict
        :return: A ``moments`` site frequency spectrum, with dimension equal
            to the length of ``sampled_demes``, and shape equal to
            ``sample_sizes`` plus one in each dimension, indexing the allele
            frequency in each deme from 0 to n[i], where i is the deme
            index.
        :rtype: :class:`moments.Spectrum`
        """
        if isinstance(g, str):
            dg = demes.load(g)
        else:
            dg = g

        if samples is None:
            if sampled_demes is None or sample_sizes is None:
                raise ValueError(
                    "must specify either samples (as a dict mapping demes "
                    "to sample sizes) or both sampled_demes and sample_sizes"
                )
        else:
            if type(samples) is not dict:
                raise ValueError(
                    "samples must be a dict mapping demes to sample sizes"
                )
            if sampled_demes is not None or sample_sizes is not None:
                raise ValueError(
                    "if samples is given as a dict, cannot "
                    "specify sampled_demes or sample_sizes"
                )
            if sample_times is not None:
                raise ValueError(
                    "if samples is given as a dict, cannot specify "
                    "sample times"
                )
            sampled_demes = list(samples.keys())
            sample_sizes = list(samples.values())

        fs = Demes.SFS(
            dg,
            sampled_demes,
            sample_sizes,
            sample_times=sample_times,
            Ne=Ne,
            unsampled_n=unsampled_n,
            gamma=gamma,
            h=h,
        )

        return fs
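    # A minimal sketch of typical usage (the YAML file name and deme names
    # are hypothetical):
    #
    #     fs = moments.Spectrum.from_demes(
    #         "model.yaml", sampled_demes=["YRI", "CEU"], sample_sizes=[20, 20]
    #     )
    #     # or, equivalently, using the samples dict:
    #     fs = moments.Spectrum.from_demes(
    #         "model.yaml", samples={"YRI": 20, "CEU": 20}
    #     )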
[docs]    @staticmethod
    def from_angsd(
        sfs_file, sample_sizes, pop_ids=None, folded=False, mask_corners=True
    ):
        """
        Convert ANGSD output to a moments Spectrum object. The sample sizes
        are given as numbers of haploid genome copies (twice the number of
        sampled diploid individuals).

        :param string sfs_file: The n-dimensional SFS from ANGSD. This
            should be a file with a single line of numbers, as entries in
            the SFS.
        :param list sample_sizes: A list of integers with length equal to
            the number of populations, storing the haploid sample size in
            each population. The order must match the population order
            provided to ANGSD.
        :param list pop_ids: A list of strings with length equal to the
            number of populations, specifying the population name for each.
        :param bool folded: If False (default), we assume ancestral states
            are known, returning an unfolded SFS. If True, the returned SFS
            is folded.
        :param bool mask_corners: If True (default), mask the fixed bins in
            the SFS. If False, the fixed bins will remain unmasked.
        :return: A ``moments`` site frequency spectrum.
        :rtype: :class:`moments.Spectrum`
        """
        if pop_ids is not None and len(pop_ids) != len(sample_sizes):
            raise ValueError(
                "length of pop ids must equal the number of "
                "populations given by sample sizes"
            )
        if len(sample_sizes) > 2:
            warnings.warn(
                "Importing ANGSD-formatted data has only been tested "
                "for 2 populations",
                UserWarning,
            )

        # Get the SFS data.
        with open(sfs_file) as f_data:
            data_line = f_data.readline()
        data = np.array([float(d) for d in data_line.split()])

        # Check that the length of the data matches the number of bins in
        # the SFS.
        reshape_dims = [n + 1 for n in sample_sizes]
        if np.prod(reshape_dims) != len(data):
            raise ValueError("size of data does not match input sample sizes")
        data = np.reshape(data, reshape_dims)

        # Turn into a moments Spectrum object.
        fs = moments.Spectrum(data, mask_corners=mask_corners, pop_ids=pop_ids)
        if folded:
            fs = fs.fold()
        return fs
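    # A minimal sketch of typical usage (the file name is hypothetical; a
    # two-population ANGSD SFS for 10 diploids per population is a single
    # line of 21 * 21 numbers):
    #
    #     fs = moments.Spectrum.from_angsd(
    #         "angsd_2d.sfs", sample_sizes=[20, 20], pop_ids=["pop0", "pop1"]
    #     )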
    # The code below ensures that when I do arithmetic with Spectrum objects,
    # it is not done between a folded and an unfolded array. If it is, I
    # raise a ValueError.

    # While I'm at it, I'm also fixing the annoying behavior that if a1 and
    # a2 are masked arrays, and a3 = a1 + a2, then wherever a1 or a2 was
    # masked, a3.data ends up with the a1.data values, rather than
    # a1.data + a2.data.

    # Note that this fix doesn't work for operation by np.ma.exp and
    # np.ma.log. Guess I can't have everything.

    # I'm using exec here to avoid copy-pasting a dozen boiler-plate
    # functions.
    # The calls to _check_other_folding ensure that we don't try to combine
    # folded and unfolded Spectrum objects.
    # I set check_folding = False in the constructor because it raises
    # useless warnings when, for example, I do (model + 1).
    # These functions also ensure that the pop_ids get properly copied over.

    # This is pretty advanced Python voodoo, so don't fret if you don't
    # understand it at first glance. :-)
    for method in [
        "__add__",
        "__radd__",
        "__sub__",
        "__rsub__",
        "__mul__",
        "__rmul__",
        "__div__",
        "__rdiv__",
        "__truediv__",
        "__rtruediv__",
        "__floordiv__",
        "__rfloordiv__",
        "__rpow__",
        "__pow__",
    ]:
        exec(
            """
def %(method)s(self, other):
    self._check_other_folding(other)
    if isinstance(other, np.ma.masked_array):
        newdata = self.data.%(method)s (other.data)
        newmask = np.ma.mask_or(self.mask, other.mask)
    else:
        newdata = self.data.%(method)s (other)
        newmask = self.mask
    newpop_ids = self.pop_ids
    if hasattr(other, 'pop_ids'):
        if other.pop_ids is None:
            newpop_ids = self.pop_ids
        elif self.pop_ids is None:
            newpop_ids = other.pop_ids
        elif other.pop_ids != self.pop_ids:
            logger.warning('Arithmetic between Spectra with different '
                           'pop_ids. Resulting pop_id may not be correct.')
    outfs = self.__class__.__new__(self.__class__, newdata, newmask,
                                   mask_corners=False,
                                   data_folded=self.folded,
                                   check_folding=False,
                                   pop_ids=newpop_ids)
    return outfs
"""
            % {"method": method}
        )

    # Methods that modify the Spectrum in-place.
    for method in [
        "__iadd__",
        "__isub__",
        "__imul__",
        "__idiv__",
        "__itruediv__",
        "__ifloordiv__",
        "__ipow__",
    ]:
        exec(
            """
def %(method)s(self, other):
    self._check_other_folding(other)
    if isinstance(other, np.ma.masked_array):
        self.data.%(method)s (other.data)
        self.mask = np.ma.mask_or(self.mask, other.mask)
    else:
        self.data.%(method)s (other)
    if hasattr(other, 'pop_ids') and other.pop_ids is not None\\
            and other.pop_ids != self.pop_ids:
        logger.warning('Arithmetic between Spectra with different pop_ids. '
                       'Resulting pop_id may not be correct.')
    return self
"""
            % {"method": method}
        )

    def _check_other_folding(self, other):
        """
        Ensure other Spectrum has same .folded status.
        """
        if isinstance(other, self.__class__) and other.folded != self.folded:
            raise ValueError(
                "Cannot operate with a folded Spectrum and an unfolded one."
            )
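    # A sketch of the behavior these generated methods enforce (the spectrum
    # size here is illustrative):
    #
    #     fs = moments.Spectrum(np.ones(11))
    #     fs + 1           # fine; mask and pop_ids are carried over
    #     fs + fs.fold()   # raises ValueError: folded/unfolded mismatch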
# Allow Spectrum objects to be pickled.
# See http://effbot.org/librarybook/copy-reg.htm
try:
    # Python 2
    import copy_reg as copyreg
except ImportError:
    # Python 3
    import copyreg


def Spectrum_unpickler(data, mask, data_folded, pop_ids):
    return moments.Spectrum(
        data,
        mask,
        mask_corners=False,
        data_folded=data_folded,
        check_folding=False,
        pop_ids=pop_ids,
    )


def Spectrum_pickler(fs):
    return (
        Spectrum_unpickler,
        (fs.data, fs.mask, fs.folded, fs.pop_ids),
    )


copyreg.pickle(Spectrum, Spectrum_pickler, Spectrum_unpickler)
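# A sketch of the round-trip this registration enables (the spectrum here is
# illustrative):
#
#     import pickle
#     fs = moments.Spectrum(np.ones(11), pop_ids=["pop0"])
#     fs2 = pickle.loads(pickle.dumps(fs))
#     assert fs2.pop_ids == fs.pop_ids and fs2.folded == fs.folded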