Source code for moments.Misc

Miscellaneous utility functions. Including ms simulation.

import bisect, collections, operator, os, sys, time

import numpy as np
import scipy.linalg

from . import Numerics
from . import Spectrum_mod
import functools

# Nucleotide order assumed in Q matrices.
code = "CGTA"

#: Storage for times at which each stream was flushed.
__times_last_flushed = {}

def delayed_flush(stream=sys.stdout, delay=1):
    Flush a stream, ensuring that it is only flushed every 'delay' *minutes*.
    Note that upon the first call to this method, the stream is not flushed.

    stream: The stream to flush. For this to work with simple 'print'
            statements, the stream should be sys.stdout.
    delay: Minimum time *in minutes* between flushes.

    This function is useful to prevent I/O overload on the cluster.
    global __times_last_flushed

    curr_time = time.time()
    # If this is the first time this method has been called with this stream,
    # we need to fill in the times_last_flushed dict. setdefault will do this
    # without overwriting any entry that may be there already.
    if stream not in __times_last_flushed:
        __times_last_flushed[stream] = curr_time
    last_flushed = __times_last_flushed[stream]

    # Note that time.time() returns values in seconds, hence the factor of 60.
    if (curr_time - last_flushed) >= delay * 60:
        __times_last_flushed[stream] = curr_time

def ensure_1arg_func(var):
    Ensure that var is actually a one-argument function.

    This is primarily used to convert arguments that are constants into
    trivial functions of time for use in integrations where parameters are
    allowed to change over time.
    if np.isscalar(var):
        # If a constant was passed in, use lambda to make it a nice
        #  simple function.
        var_f = lambda t: var
        var_f = var
    if not callable(var_f):
        raise ValueError("Argument is not a constant or a function.")
    except TypeError:
        raise ValueError("Argument is not a constant or a one-argument " "function.")
    return var_f

def ms_command(theta, ns, core, iter, recomb=0, rsites=None, seeds=None):
    Generate ms command for simulation from core.

    theta: Assumed theta
    ns: Sample sizes
    core: Core of ms command that specifies demography.
    iter: Iterations to run ms
    recomb: Assumed recombination rate
    rsites: Sites for recombination. If None, default is 10*theta.
    seeds: Seeds for random number generator. If None, ms default is used.
           Otherwise, three integers should be passed. Example: (132, 435, 123)
    if len(ns) > 1:
        ms_command = (
            "ms %(total_chrom)i %(iter)i -t %(theta)f -I %(numpops)i "
            "%(sample_sizes)s %(core)s"
        ms_command = "ms %(total_chrom)i %(iter)i -t %(theta)f  %(core)s"

    if recomb:
        ms_command = ms_command + " -r %(recomb)f %(rsites)i"
        if not rsites:
            rsites = theta * 10
    sub_dict = {
        "total_chrom": np.sum(ns),
        "iter": iter,
        "theta": theta,
        "numpops": len(ns),
        "sample_sizes": " ".join(map(str, ns)),
        "core": core,
        "recomb": recomb,
        "rsites": rsites,

    ms_command = ms_command % sub_dict

    if seeds is not None:
        seed_command = " -seeds %i %i %i" % (seeds[0], seeds[1], seeds[2])
        ms_command = ms_command + seed_command

    return ms_command

[docs]def perturb_params(params, fold=1, lower_bound=None, upper_bound=None): """ Generate a perturbed set of parameters. Each element of params is randomly perturbed `fold` factors of 2 up or down. :param fold: Number of factors of 2 to perturb by, defaults to 1. :type fold: float, optional :param lower_bound: If not None, the resulting parameter set is adjusted to have all value greater than lower_bound. :type lower_bound: list of floats, optional :param upper_bound: If not None, the resulting parameter set is adjusted to have all value less than upper_bound. :type upper_bound: list of floats, optional """ pnew = params * 2 ** (fold * (2 * np.random.random(len(params)) - 1)) if lower_bound is not None: for ii, bound in enumerate(lower_bound): if bound is None: lower_bound[ii] = -np.inf pnew = np.maximum(pnew, 1.01 * np.asarray(lower_bound)) if upper_bound is not None: for ii, bound in enumerate(upper_bound): if bound is None: upper_bound[ii] = np.inf pnew = np.minimum(pnew, 0.99 * np.asarray(upper_bound)) return pnew
def make_fux_table(fid, ts, Q, tri_freq): """ Make file of 1-fux for use in ancestral misidentification correction. fid: Filename to output to. ts: Expected number of substitutions per site between ingroup and outgroup. Q: Trinucleotide transition rate matrix. This should be a 64x64 matrix, in which entries are ordered using the code CGTA -> 0,1,2,3. For example, ACT -> 3*16+0*4+2*1=50. The transition rate from ACT to AGT is then entry 50,54. tri_freq: Dictionary in which each entry maps a trinucleotide to its ancestral frequency. e.g. {'AAA': 0.01, 'AAC':0.012...} Note that should be the frequency in the entire region scanned for variation, not just sites where there are SNPs. """ # Ensure that the *columns* of Q sum to zero. # That is the correct condition when Q_{i,j} is the rate from i to j. # This indicates a typo in Hernandez, Williamson, and Bustamante. for ii in range(Q.shape[1]): s = Q[:, ii].sum() - Q[ii, ii] Q[ii, ii] = -s eQhalf = scipy.linalg.matfuncs.expm(Q * ts / 2.0) if not hasattr(fid, "write"): newfile = True fid = open(fid, "w") outlines = [] for first_ii, first in enumerate(code): for x_ii, x in enumerate(code): for third_ii, third in enumerate(code): # This the index into Q and eQ xind = 16 * first_ii + 4 * x_ii + 1 * third_ii for u_ii, u in enumerate(code): # This the index into Q and eQ uind = 16 * first_ii + 4 * u_ii + 1 * third_ii ## Note that the Q terms factor out in our final ## calculation, because for both PMuUu and PMuUx the final ## factor in Eqn 2 is P(S={u,x}|M=u). # Qux = Q[uind,xind] # denomu = Q[uind].sum() - Q[uind,uind] PMuUu, PMuUx = 0, 0 # Equation 2 in HWB. We have to generalize slightly to # calculate PMuUx. In calculate PMuUx, we're summing over # alpha the probability that the MRCA was alpha, and it # substituted to x on the outgroup branch, and it # substituted to u on the ingroup branch, and it mutated to # x in the ingroup (conditional on it having mutated in the # ingroup). Note that the mutation to x condition cancels # in fux, so we don't bother to calculate it. for aa, alpha in enumerate(code): aind = 16 * first_ii + 4 * aa + 1 * third_ii pia = tri_freq[first + alpha + third] Pau = eQhalf[aind, uind] Pax = eQhalf[aind, xind] PMuUu += pia * Pau * Pau PMuUx += pia * Pau * Pax # This is 1-fux. For a given SNP with actual ancestral state # u and derived allele x, this is 1 minus the probability # that the outgroup will have u. # Eqn 3 in HWB. res = 1 - PMuUu / (PMuUu + PMuUx) # These aren't SNPs, so we can arbitrarily set them to 0 if u == x: res = 0 outlines.append("%c%c%c %c %.6f" % (first, x, third, u, res)) fid.write(os.linesep.join(outlines)) if newfile: fid.close() def zero_diag(Q): """ Copy of Q altered such that diagonal entries are all 0. """ Q_nodiag = Q.copy() for ii in range(Q.shape[0]): Q_nodiag[ii, ii] = 0 return Q_nodiag def tri_freq_dict_to_array(tri_freq_dict): """ Convert dictionary of trinucleotide frequencies to array in correct order. """ tripi = np.zeros(64) for ii, left in enumerate(code): for jj, center in enumerate(code): for kk, right in enumerate(code): row = ii * 16 + jj * 4 + kk tripi[row] = tri_freq_dict[left + center + right] return tripi def total_instantaneous_rate(Q, pi): """ Total instantaneous substitution rate. """ Qzero = zero_diag(Q) return, Qzero).sum() def make_data_dict(filename): """ Parse a file containing genomic sequence information in the format described by the wiki, and store the information in a properly formatted dictionary. filename: Name of file to work with. The file can be zipped (extension .zip) or gzipped (extension .gz). If zipped, there must be only a single file in the zip archive. """ if os.path.splitext(filename)[1] == ".gz": import gzip f = elif os.path.splitext(filename)[1] == ".zip": import zipfile archive = zipfile.ZipFile(filename) namelist = archive.namelist() if len(namelist) != 1: raise ValueError( "Must be only a single data file in zip " "archive: %s" % filename ) f =[0]) else: f = open(filename) # Skip to the header while True: header = f.readline() if not header.startswith("#"): break allele2_index = header.split().index("Allele2") # Pull out our pop ids pops = header.split()[3:allele2_index] # The empty data dictionary data_dict = {} # Now walk down the file for SNP_ii, line in enumerate(f): if line.startswith("#"): continue # Split the into fields by whitespace spl = line.split() data_this_snp = {} # We convert to upper case to avoid any issues with mixed case between # SNPs. data_this_snp["context"] = spl[0].upper() data_this_snp["outgroup_context"] = spl[1].upper() data_this_snp["outgroup_allele"] = spl[1][1].upper() data_this_snp["segregating"] = spl[2].upper(), spl[allele2_index].upper() calls_dict = {} for ii, pop in enumerate(pops): calls_dict[pop] = int(spl[3 + ii]), int(spl[allele2_index + 1 + ii]) data_this_snp["calls"] = calls_dict # We name our SNPs using the final columns snp_key = ( spl[allele2_index + len(pops) + 1], spl[allele2_index + len(pops) + 2], ) if snp_key == "": snp_key = ("SNP", f"{SNP_ii}") data_dict[snp_key] = data_this_snp return data_dict
[docs]def count_data_dict(data_dict, pop_ids): """ Summarize data in data_dict by mapping SNP configurations to counts. Returns a dictionary with keys (successful_calls, derived_calls, polarized) mapping to counts of SNPs. Here successful_calls is a tuple with the number of good calls per population, derived_calls is a tuple of derived calls per pop, and polarized indicates whether that SNP was polarized using an ancestral state. :param data_dict: data_dict formatted as in Misc.make_data_dict :type data_dict: data dictionary :param pop_ids: IDs of populations to collect data for :type pop_ids: list of strings """ count_dict = collections.defaultdict(int) for snp_key, snp_info in data_dict.items(): # Skip SNPs that aren't biallelic. if len(snp_info["segregating"]) != 2: continue allele1, allele2 = snp_info["segregating"] if ( "outgroup_allele" in snp_info and snp_info["outgroup_allele"] != "-" and snp_info["outgroup_allele"] in snp_info["segregating"] ): outgroup_allele = snp_info["outgroup_allele"] this_snp_polarized = True else: outgroup_allele = allele1 this_snp_polarized = False # Extract the allele calls for each population. allele1_calls = [snp_info["calls"][pop][0] for pop in pop_ids] allele2_calls = [snp_info["calls"][pop][1] for pop in pop_ids] # How many chromosomes did we call successfully in each population? successful_calls = [a1 + a2 for (a1, a2) in zip(allele1_calls, allele2_calls)] # Which allele is derived (different from outgroup)? if allele1 == outgroup_allele: derived_calls = allele2_calls elif allele2 == outgroup_allele: derived_calls = allele1_calls # Update count_dict count_dict[ tuple(successful_calls), tuple(derived_calls), this_snp_polarized ] += 1 return count_dict
[docs]def make_data_dict_vcf( vcf_filename, popinfo_filename, filter=True, flanking_info=[None, None], skip_multiallelic=True, ): """ Parse a VCF file containing genomic sequence information, along with a file identifying the population of each sample, and store the information in a properly formatted dictionary. Each file may be zipped (.zip) or gzipped (.gz). If a file is zipped, it must be the only file in the archive, and the two files cannot be zipped together. Both files must be present for the function to work. :param vcf_filename: Name of VCF file to work with. The function currently works for biallelic SNPs only, so if REF or ALT is anything other than a single base pair (A, C, T, or G), the allele will be skipped. Additionally, genotype information must be present in the FORMAT field GT, and genotype info must be known for every sample, else the SNP will be skipped. If the ancestral allele is known it should be specified in INFO field 'AA'. Otherwise, it will be set to '-'. :type vcf_filename: str :param popinfo_filename: Name of file containing the population assignments for each sample in the VCF. If a sample in the VCF file does not have a corresponding entry in this file, it will be skipped. See _get_popinfo for information on how this file must be formatted. :type popinfo_filename: str :param filter: If set to True, alleles will be skipped if they have not passed all filters (i.e. either 'PASS' or '.' must be present in FILTER column. :type filter: bool, optional :param flanking_info: Flanking information for the reference and/or ancestral allele can be provided as field(s) in the INFO column. To add this information to the dict, flanking_info should specify the names of the fields that contain this info as a list (e.g. ['RFL', 'AFL'].) If context info is given for only one allele, set the other item in the list to None, (e.g. ['RFL', None]). Information can be provided as a 3 base-pair sequence or 2 base-pair sequence, where the first base-pair is the one immediately preceding the SNP, and the last base-pair is the one immediately following the SNP. :type flanking_info: list of strings, optional :param skip_multiallelic: If True, only keep biallelic sites, and skip sites that have more than one ALT allele. :type skip_multiallelic: bool, optional """ if not skip_multiallelic: raise ValueError( "We can only keep biallelic sites, and multiallelic tallying is not " "currently supported. Set skip_multiallelic to True." ) # Read population information from file based on extension if os.path.splitext(popinfo_filename)[1] == ".gz": import gzip popinfo_file = elif os.path.splitext(popinfo_filename)[1] == ".zip": import zipfile archive = zipfile.ZipFile(popinfo_filename) namelist = archive.namelist() if len(namelist) != 1: raise ValueError( "Must be only a single popinfo file in zip " "archive: {}".format(popinfo_filename) ) popinfo_file =[0]) else: popinfo_file = open(popinfo_filename) # pop_dict has key, value pairs of "SAMPLE_NAME" : "POP_NAME" popinfo_dict = _get_popinfo(popinfo_file) popinfo_file.close() # Open VCF file if os.path.splitext(vcf_filename)[1] == ".gz": import gzip vcf_file = elif os.path.splitext(vcf_filename)[1] == ".zip": import zipfile archive = zipfile.ZipFile(vcf_filename) namelist = archive.namelist() if len(namelist) != 1: raise ValueError( "Must be only a single vcf file in zip " "archive: {}".format(vcf_filename) ) vcf_file =[0]) else: vcf_file = open(vcf_filename) data_dict = {} for line in vcf_file: # decoding lines for Python 3 - probably a better way to handle this try: line = line.decode() except AttributeError: pass # Skip metainformation if line.startswith("##"): continue # Read header if line.startswith("#"): header_cols = line.split() # Ensure there is at least one sample if len(header_cols) <= 9: raise ValueError("No samples in VCF file") # Use popinfo_dict to get the order of populations present in VCF poplist = [ popinfo_dict[sample] if sample in popinfo_dict else None for sample in header_cols[9:] ] continue # Read SNP data cols = line.split() snp_key = (cols[0], cols[1]) # (CHROM, POS) snp_dict = {} # Skip SNP if filter is set to True and it fails a filter test if filter and cols[6] != "PASS" and cols[6] != ".": continue # Add reference and alternate allele info to dict ref, alt = (allele.upper() for allele in cols[3:5]) if ref not in ["A", "C", "G", "T"] or alt not in ["A", "C", "G", "T"]: # Skip line if site is not an SNP continue snp_dict["segregating"] = (ref, alt) snp_dict["context"] = "-" + ref + "-" # Add ancestral allele information if available info = cols[7].split(";") for field in info: if field.startswith("AA"): outgroup_allele = field[3:].upper() if outgroup_allele not in ["A", "C", "G", "T"]: # Skip if ancestral not single base A, C, G, or T outgroup_allele = "-" break else: outgroup_allele = "-" snp_dict["outgroup_allele"] = outgroup_allele snp_dict["outgroup_context"] = "-" + outgroup_allele + "-" # Add flanking info if it is present rflank, aflank = flanking_info for field in info: if rflank and field.startswith(rflank): flank = field[len(rflank + 1) :].upper() if not (len(flank) == 2 or len(flank) == 3): continue prevb, nextb = flank[0], flank[-1] if prevb not in ["A", "C", "T", "G"]: prevb = "-" if nextb not in ["A", "C", "T", "G"]: nextb = "-" snp_dict["context"] = prevb + ref + nextb continue if aflank and field.startswith(aflank): flank = field[len(aflank + 1) :].upper() if not (len(flank) == 2 or len(flank) == 3): continue prevb, nextb = flank[0], flank[-1] if prevb not in ["A", "C", "T", "G"]: prevb = "-" if nextb not in ["A", "C", "T", "G"]: nextb = "-" snp_dict["outgroup_context"] = prevb + outgroup_allele + nextb # Add reference and alternate allele calls for each population calls_dict = {} gtindex = cols[8].split(":").index("GT") for pop, sample in zip(poplist, cols[9:]): if pop is None: continue gt = sample.split(":")[gtindex] g1, g2 = gt[0], gt[2] if pop not in calls_dict: calls_dict[pop] = (0, 0) refcalls, altcalls = calls_dict[pop] refcalls += int(g1 == "0") + int(g2 == "0") altcalls += int(g1 == "1") + int(g2 == "1") calls_dict[pop] = (refcalls, altcalls) snp_dict["calls"] = calls_dict data_dict[snp_key] = snp_dict vcf_file.close() return data_dict
def _get_popinfo(popinfo_file): """ Helper function for make_data_dict_vcf. Takes an open file that contains information on the population designations of each sample within a VCF file, and returns a dictionary containing {"SAMPLE_NAME" : "POP_NAME"} pairs. The file should be formatted as a table, with columns delimited by whitespace, and rows delimited by new lines. Lines beginning with '#' are considered comments and will be ignored. Each sample must appear on its own line. If no header information is provided, the first column will be assumed to be the SAMPLE_NAME column, while the second column will be assumed to be the POP_NAME column. If a header is present, it must be the first non-comment line of the file. The column positions of the words "SAMPLE" and "POP" (ignoring case) in this header will be used to determine proper positions of the SAMPLE_NAME and POP_NAME columns in the table. popinfo_file : An open text file of the format described above. """ popinfo_dict = {} sample_col = 0 pop_col = 1 header = False # check for header info for line in popinfo_file: if line.startswith("#"): continue cols = [col.lower() for col in line.split()] if "sample" in cols: header = True sample_col = cols.index("sample") if "pop" in cols: header = True pop_col = cols.index("pop") break # read in population information for each sample for line in popinfo_file: if line.startswith("#"): continue cols = line.split() sample = cols[sample_col] pop = cols[pop_col] # avoid adding header to dict if (sample.lower() == "sample" or pop.lower() == "pop") and header: header = False continue popinfo_dict[sample] = pop return popinfo_dict
[docs]def bootstrap( data_dict, pop_ids, projections, mask_corners=True, polarized=True, bed_filename=None, num_boots=100, save_dir=None, ): """ Use a non-parametric bootstrap on SNP information contained in a dictionary to generate new data sets. The new data is created by sampling with replacement from independent units of the original data. These units can simply be chromosomes, or they can be regions specified in a BED file. This function either returns a list of all the newly created SFS, or writes them to disk in a specified directory. See :func:`moments.Spectrum.from_data_dict` for more details about the options for creating spectra. :param data_dict: Dictionary containing properly formatted SNP information (i.e. created using one of the make_data_dict methods). :type data_dict: dict of SNP information :param pop_ids: List of population IDs. :type pop_ids: list of strings :param projections: Projection sizes for the given population IDs. :type projections: list of ints :param mask_corners: If True, mask the invariant bins of the SFS. :type mask_corners: bool, optional :param polarized: If True, we assume we know the ancestral allele. If False, return folded spectra. :type polarized: bool, optional :param bed_filename: If None, chromosomes will be used as the units for resampling. Otherwise, this should be the filename of a BED file specifying the regions to be used as resampling units. Chromosome names must be consistent between the BED file and the data dictionary, or bootstrap will not work. For example, if an entry in the data dict has ID X_Y, then the value in in the chromosome field of the BED file must also be X (not chrX, chromosomeX, etc.). If the name field is provided in the BED file, then any regions with the same name will be considered to be part of the same unit. This may be useful for sampling as one unit a gene that is located across non-continuous regions. :type bed_filename: string as path to bed file :param num_boots: Number of resampled SFS to generate. :type num_boots: int, optional :param save_dir: If None, the SFS are returned as a list. Otherwise this should be a string specifying the name of a new directory under which all of the new SFS should be saved. :type save_dir: str, optional """ # Read in information from BED file if present and store by chromosome if bed_filename is not None: bed_file = open(bed_filename) bed_info_dict = {} for linenum, line in enumerate(bed_file): fields = line.split() # Read in mandatory fields chrom, start, end = fields[:3] start = int(start) end = int(end) # Read label info if present, else assign unique label by line number label = linenum if len(fields) >= 4: label = fields[3] # Add information to the appropriate chromosome if chrom not in bed_info_dict: bed_info_dict[chrom] = [] bed_info_dict[chrom].append((start, end, label)) bed_file.close() # Sort entries by start position, for easier location of proper region start_dict = {} for chrom, bed_info in bed_info_dict.items(): bed_info.sort(key=lambda k: k[0]) start_dict[chrom] = [region[0] for region in bed_info] # Dictionary will map region labels to the SNPs contained in that region region_dict = {} # Iterate through data_dict and add SNPs to proper region for snp_key in data_dict: chrom, pos = snp_key pos = int(pos) # Quickly locate proper region in sorted list loc = bisect.bisect_right(start_dict[chrom], pos) - 1 if loc >= 0 and bed_info_dict[chrom][loc][1] >= pos: label = bed_info_dict[chrom][loc][2] if label not in region_dict: region_dict[label] = [] region_dict[label].append(snp_key) # Separate by chromosome if no BED file provided else: region_dict = {} for snp_key in data_dict: chrom, pos = snp_key if chrom not in region_dict: region_dict[chrom] = [] region_dict[chrom].append(snp_key) # Each entry of list represents single region, with a tuple # containing the IDs of all SNPs in the region. sample_regions = [tuple(val) for key, val in region_dict.items()] num_regions = len(sample_regions) if save_dir is None: new_sfs_list = [] elif not os.path.exists(save_dir): os.makedirs(save_dir) # Repeatedly resample regions to create new data sets for bootnum in range(num_boots): # Set up new SFS npops = len(pop_ids) new_sfs = np.zeros(np.asarray(projections) + 1) # Make random selection of regions (with replacement) choices = np.random.randint(0, num_regions, num_regions) # For each selected region, add its SNP info to SFS for choice in choices: for snp_key in sample_regions[choice]: snp_info = data_dict[snp_key] # Skip SNPs that aren't biallelic. if len(snp_info["segregating"]) != 2: continue allele1, allele2 = snp_info["segregating"] if not polarized: # If not polarizing, derived allele is arbitrary outgroup_allele = allele1 elif ( "outgroup_allele" in snp_info and snp_info["outgroup_allele"] != "-" and snp_info["outgroup_allele"] in snp_info["segregating"] ): # Otherwise check that it is a useful outgroup outgroup_allele = snp_info["outgroup_allele"] else: # If polarized and without good outgroup, skip SNP continue # Extract allele calls for each population. allele1_calls = [snp_info["calls"][pop][0] for pop in pop_ids] allele2_calls = [snp_info["calls"][pop][1] for pop in pop_ids] successful_calls = [ a1 + a2 for (a1, a2) in zip(allele1_calls, allele2_calls) ] derived_calls = ( allele2_calls if allele1 == outgroup_allele else allele1_calls ) # Slicing allows handling of arbitray population numbers slices = [[np.newaxis] * npops for i in range(npops)] for i in range(npops): slices[i][i] = slice(None, None, None) # Do projections for this SNP pop_contribs = [] call_iter = zip(projections, successful_calls, derived_calls) for pop_index, (p_to, p_from, hits) in enumerate(call_iter): contrib = Numerics._cached_projection(p_to, p_from, hits)[ tuple(slices[pop_index]) ] pop_contribs.append(contrib) new_sfs += functools.reduce(operator.mul, pop_contribs) new_sfs = Spectrum_mod.Spectrum( new_sfs, mask_corners=mask_corners, pop_ids=pop_ids ) if not polarized: new_sfs.fold() if save_dir is None: new_sfs_list.append(new_sfs) else: filename = "{}/SFS_{}".format(save_dir, bootnum) new_sfs.to_file(filename) return new_sfs_list if save_dir is None else None
def flip_ancestral_misid(fs, p_misid): if p_misid < 0 or p_misid > 1: raise ValueError( "probability of misidentification must be between zero and one." ) fs_misid = (1 - p_misid) * fs + p_misid * np.flip(fs) return fs_misid