Source code for moments.LD.Parsing

_imported_h5py = 0
_imported_allel = 0
_imported_pandas = 0
import os

try:
    os.environ["NUMEXPR_MAX_THREADS"] = "272"
    import allel

    _imported_allel = 1
except ImportError:
    pass

try:
    import h5py

    _imported_h5py = 1
except ImportError:
    pass

try:
    import pandas

    _imported_pandas = 1
except ImportError:
    pass

from . import Util


def check_imports():
    if _imported_allel == 0:
        raise ImportError("Failed to import allel package needed for Parsing.")
    if _imported_h5py == 0:
        raise ImportError("Failed to import h5py package needed for Parsing.")
    if _imported_pandas == 0:
        raise ImportError("Failed to import pandas package needed for Parsing.")


import numpy as np
from collections import Counter, defaultdict

# from . import stats_from_genotype_counts as sgc
from . import stats_from_haplotype_counts as shc
import sys
import itertools

ld_extensions = 0
try:
    import genotype_calculations as gcs
    import genotype_calculations_multipop as gcs_mp
    import sparse_tallying as spt

    ld_extensions = 1
except ImportError:
    pass

# turn off UserWarnings from allel
import warnings

if _imported_allel:
    warnings.filterwarnings(
        action="ignore", message="'GQ' FORMAT", category=UserWarning
    )


def _load_h5(vcf_file, report=True):
    check_imports()
    ## open the h5 callset, creating it if it doesn't exist
    ## note that if the h5 file exists but isn't properly written,
    ## you will need to delete and recreate it
    ## the h5 callset is saved with the same name and path,
    ## but with the h5 extension instead of vcf or vcf.gz
    h5_file_path = vcf_file.split(".vcf")[0] + ".h5"
    try:
        callset = h5py.File(h5_file_path, mode="r")
    except (OSError, IOError):  # IOError merged into OSError in python 3
        if report is True:
            print("creating and saving h5 file")
            sys.stdout.flush()
        allel.vcf_to_hdf5(
            vcf_file,
            h5_file_path,
            fields="*",
            exclude_fields=["calldata/GQ"],
            overwrite=True,
        )
        callset = h5py.File(h5_file_path, mode="r")
    return callset


def get_genotypes(
    vcf_file, bed_file=None, chromosome=None, min_bp=None, use_h5=True, report=True
):
    """
    Given a vcf file, we extract the biallelic SNP genotypes.

    If bed_file is None, we use all valid variants. Otherwise we filter
    genotypes by the given bed file.

    If chromosome is given, filters to keep snps only in the given chromosome
    (useful for vcfs spanning multiple chromosomes).

    If use_h5 is True, we try to load the h5 file, which has the same
    path/name as vcf_file, but with {fname}.h5 instead of {fname}.vcf or
    {fname}.vcf.gz. If the h5 file does not exist, we create it and save it
    as {fname}.h5

    Returns (biallelic positions, biallelic genotypes, biallelic allele
    counts, sample ids).

    :param vcf_file: A VCF-formatted file.
    :type vcf_file: str
    :param bed_file: A bed file specifying regions to compute statistics from.
        The chromosome name formatting must match the chromosome name
        formatting of the input VCF (i.e., both carry the leading "chr" or
        both omit it).
    :type bed_file: str, optional
    :param min_bp: Only used with a bed file; filters out features that are
        smaller than ``min_bp``.
    :type min_bp: int, optional
    :param chromosome: Chromosome to compute LD statistics from.
    :type chromosome: int or str, optional
    :param use_h5: If use_h5 is True, we try to load the h5 file, which has
        the same path/name as vcf_file, but with .h5 instead of .vcf or
        .vcf.gz extension. If the h5 file does not exist, we create it and
        save it with .h5 extension. Defaults to True.
    :type use_h5: bool, optional
    :param report: Prints progress updates if True, silent otherwise.
        Defaults to True.
    :type report: bool, optional
    """
    check_imports()
    if use_h5 is True:
        callset = _load_h5(vcf_file, report=report)
    else:
        callset = allel.read_vcf(vcf_file)

    all_genotypes = allel.GenotypeChunkedArray(callset["calldata/GT"])
    all_positions = callset["variants/POS"][:]

    # if there are multiple chromosomes in the VCF, we require a specified
    # chromosome
    try:
        all_chromosomes = np.array([c.decode() for c in callset["variants/CHROM"][:]])
    except AttributeError:
        all_chromosomes = callset["variants/CHROM"][:]
    num_chromosomes = len(set(all_chromosomes))
    if num_chromosomes == 1:
        if chromosome is None:
            chromosome = all_chromosomes[0]
    elif num_chromosomes > 1:
        if chromosome is None:
            raise ValueError(
                "The input VCF has more than one chromosome present. "
                "The `chromosome` must be specified."
            )
        if str(chromosome) not in all_chromosomes:
            raise ValueError(
                f"The specified chromosome, {chromosome}, was not found among "
                "sites in the VCF. Double check the input chromosome name."
            )
    if num_chromosomes > 1:
        in_chromosome = all_chromosomes == str(chromosome)
        all_positions = all_positions.compress(in_chromosome)
        all_genotypes = all_genotypes.compress(in_chromosome)

    if bed_file is not None:
        bed_file_data = pandas.read_csv(bed_file, delim_whitespace=True, header=None)
        bed_chromosomes = np.array(bed_file_data[0])
        bed_lefts = np.array(bed_file_data[1])
        bed_rights = np.array(bed_file_data[2])
        # only keep rows of bed file that match our chromosome
        bed_chrom_filter = bed_chromosomes.astype(str) == str(chromosome)
        bed_chromosomes = bed_chromosomes.compress(bed_chrom_filter)
        if len(bed_chromosomes) == 0:
            raise ValueError(
                "No regions of the bed file matched the input chromosome. "
                "Check that chromosome names match between bed file and VCF."
            )
        bed_lefts = bed_lefts.compress(bed_chrom_filter)
        bed_rights = bed_rights.compress(bed_chrom_filter)

        # if a minimum feature length is specified, only keep features that
        # are long enough
        if min_bp is not None:
            bp_filter = bed_rights - bed_lefts >= min_bp
            bed_lefts = bed_lefts.compress(bp_filter)
            bed_rights = bed_rights.compress(bp_filter)
            if len(bed_lefts) == 0:
                raise ValueError(
                    f"No features in bed file were at least {min_bp} bp in length."
                )

        in_bed = all_positions < 0
        for left, right in zip(bed_lefts, bed_rights):
            in_bed = np.logical_or(
                in_bed, np.logical_and(all_positions >= left, all_positions < right)
            )

        all_positions = all_positions.compress(in_bed)
        all_genotypes = all_genotypes.compress(in_bed)

    all_genotypes_012 = all_genotypes.to_n_alt(fill=-1)

    # count alleles and only keep biallelic positions
    allele_counts = all_genotypes.count_alleles()
    is_biallelic = allele_counts.is_biallelic_01()
    biallelic_positions = all_positions.compress(is_biallelic)
    biallelic_genotypes_012 = all_genotypes_012.compress(is_biallelic)
    biallelic_allele_counts = allele_counts.compress(is_biallelic)
    biallelic_genotypes = all_genotypes.compress(is_biallelic)

    relevant_column = np.array([False] * biallelic_allele_counts.shape[1])
    relevant_column[0:2] = True
    biallelic_allele_counts = biallelic_allele_counts.compress(relevant_column, axis=1)

    # protect against variable encoding of sample id strings
    try:
        sample_ids = np.array([sid.decode() for sid in callset["samples"]])
    except AttributeError:
        sample_ids = callset["samples"]

    return (
        biallelic_positions,
        biallelic_genotypes,
        biallelic_allele_counts,
        sample_ids,
    )
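

# Usage sketch (not part of the original module; the file and chromosome
# names are hypothetical): extract biallelic genotypes for one chromosome,
# restricted to regions in a bed file.
def _example_get_genotypes():  # pragma: no cover
    positions, genotypes, counts, sample_ids = get_genotypes(
        "data/chr22.vcf.gz",
        bed_file="data/regions.bed",
        chromosome="22",
        min_bp=1000,
        use_h5=True,
    )
    print(f"kept {len(positions)} biallelic variants from {len(sample_ids)} samples")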


def _assign_r_pos(positions, rec_map):
    rs = np.zeros(len(positions))
    for ii, pos in enumerate(positions):
        if pos in np.array(rec_map[0]):
            rs[ii] = np.array(rec_map[1])[np.argwhere(pos == np.array(rec_map[0]))[0]]
        else:
            ## for now, if outside the rec map, assign to the nearest point,
            ## but later we want to drop these positions
            if pos < rec_map[0].iloc[0]:
                rs[ii] = rec_map[1].iloc[0]
            elif pos > rec_map[0].iloc[-1]:
                rs[ii] = rec_map[1].iloc[-1]
            else:
                # linearly interpolate between the two flanking map points
                map_ii = np.where(pos >= np.array(rec_map[0]))[0][-1]
                l = rec_map[0][map_ii]
                r = rec_map[0][map_ii + 1]
                v_l = rec_map[1][map_ii]
                v_r = rec_map[1][map_ii + 1]
                rs[ii] = v_l + (v_r - v_l) * (pos - l) / (r - l)
    return rs


def _assign_recombination_rates(
    positions, map_file, map_name=None, map_sep=None, cM=True, report=True
):
    ## map_sep is now deprecated. we split by any white space
    if map_file is None:
        raise ValueError(
            "Need to pass a recombination map file; otherwise we can only "
            "bin by physical distance."
        )
    try:
        rec_map = pandas.read_csv(map_file, delim_whitespace=True)
    except:
        raise ValueError("Error loading recombination map.")

    map_positions = rec_map[rec_map.keys()[0]]
    if map_name is None:  # we use the first map column
        if report is True:
            print("No recombination map name given, using first column.")
            sys.stdout.flush()
        map_values = rec_map[rec_map.keys()[1]]
    else:
        try:
            map_values = rec_map[map_name]
        except KeyError:
            print(
                "WARNING: map_name did not match map names in recombination "
                "map file. Using the first column."
            )
            map_values = rec_map[rec_map.keys()[1]]

    # positions sticking out beyond the ends of the map take the value of the
    # closest map position; ideally, you'd filter these out
    if cM is True:
        map_values /= 100

    pos_rs = _assign_r_pos(positions, [map_positions, map_values])
    return pos_rs


## We store a genotype matrix, which has size L x n, in sparse format,
## indexed by position in the genotype array (0...L-1), with
## G_dict[i] = {1: J, 2: K},
## where J and K are sets of the diploid individual indices that
## have genotypes 1 or 2.
## If there is any missing data, we also store the set of individuals
## with -1's.
def _sparsify_genotype_matrix(G):
    G_dict = {}
    if np.any(G == -1):
        missing = True
    else:
        missing = False
    for i in range(len(G)):
        G_dict[i] = {
            1: set(np.where(G[i, :] == 1)[0]),
            2: set(np.where(G[i, :] == 2)[0]),
        }
        if missing is True:
            G_dict[i][-1] = set(np.where(G[i, :] == -1)[0])
        else:
            G_dict[i][-1] = set()
    return G_dict, missing


def _sparsify_haplotype_matrix(G):
    G_dict = {}
    if np.any(G == -1):
        missing = True
    else:
        missing = False
    for i in range(len(G)):
        G_dict[i] = {
            1: set(np.where(G[i, :] == 1)[0]),
        }
        if missing is True:
            G_dict[i][-1] = set(np.where(G[i, :] == -1)[0])
        else:
            G_dict[i][-1] = set()
    return G_dict, missing


def _check_valid_genotype_matrix(G, genotypes):
    if genotypes:
        # 0, 1, 2 for genotype values, -1 for missing data
        if not np.all(
            np.logical_or(np.logical_or(np.logical_or(G == 0, G == 1), G == 2), G == -1)
        ):
            raise ValueError(
                "Genotype matrix must have values of 0, 1, or 2, "
                "or -1 for missing data"
            )
    else:
        # haplotypes: 0 or 1, with -1 for missing data
        if not np.all(np.logical_or(np.logical_or(G == 0, G == 1), G == -1)):
            raise ValueError(
                "Haplotype matrix must have values of 0 or 1, "
                "or -1 for missing data"
            )
    L, n = np.shape(G)
    # 46341**2 overflows a signed 32-bit integer, so cap the number of loci
    if L > 46340:
        raise ValueError("Genotype matrix is too large, consider parallelizing LD calc")
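

# Illustration (not part of the original module): the sparse representation
# for a 2 x 4 genotype matrix with one missing call. Each locus maps the
# genotype values 1, 2, and -1 to sets of diploid column indices.
def _example_sparsify():  # pragma: no cover
    G = np.array(
        [
            [0, 1, 2, 1],
            [0, 0, -1, 2],
        ]
    )
    G_dict, missing = _sparsify_genotype_matrix(G)
    # G_dict == {0: {1: {1, 3}, 2: {2}, -1: set()},
    #            1: {1: set(), 2: {3}, -1: {2}}}
    # missing == True, because one call is -1
    print(G_dict, missing)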


def compute_pairwise_stats(Gs, genotypes=True):
    """
    Computes :math:`D^2`, :math:`Dz`, :math:`\\pi_2`, and :math:`D` for every
    pair of loci within a block of SNPs, coded as a genotype matrix.

    :param Gs: A genotype matrix, of size L-by-n, where L is the number of
        loci and n is the sample size. Missing data is encoded as -1.
    :param genotypes: If True, use 0, 1, 2 genotypes. If False, use 0, 1
        phased haplotypes.
    """
    if ld_extensions != 1:
        raise ValueError(
            "Need to build LD cython extensions. "
            "Install moments with the flag `--ld_extensions`"
        )
    _check_valid_genotype_matrix(Gs, genotypes)

    L, n = np.shape(Gs)

    if genotypes:
        G_dict, any_missing = _sparsify_genotype_matrix(Gs)
        Counts = spt.count_genotypes_sparse(G_dict, n, missing=any_missing)
    else:
        G_dict, any_missing = _sparsify_haplotype_matrix(Gs)
        Counts = spt.count_haplotypes_sparse(G_dict, n, missing=any_missing)

    if genotypes:
        D = gcs.compute_D(Counts)
        D2 = gcs.compute_D2(Counts)
        Dz = gcs.compute_Dz(Counts)
        pi2 = gcs.compute_pi2(Counts)
    else:
        D = shc.D(Counts.T)
        D2 = shc.DD([Counts.T], [0, 0])
        Dz = shc.Dz([Counts.T], [0, 0, 0])
        pi2 = shc.pi2([Counts.T], [0, 0, 0, 0])

    return D2, Dz, pi2, D
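

# Usage sketch (not part of the original module; the random matrix is purely
# illustrative, and the cython LD extensions must be built): all pairwise
# statistics for a small block of unphased SNPs.
def _example_pairwise_stats():  # pragma: no cover
    rng = np.random.default_rng(seed=13)
    Gs = rng.integers(0, 3, size=(10, 20))  # 10 loci, 20 diploid samples
    D2, Dz, pi2, D = compute_pairwise_stats(Gs)
    # one entry per pair of loci: 10 * 9 / 2 = 45 pairs
    print(len(D2), "pairs")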


def compute_average_stats(Gs, genotypes=True):
    """
    Takes the outputs of ``compute_pairwise_stats`` and returns the average
    value for each statistic.

    :param Gs: A genotype matrix, of size L-by-n, where L is the number of
        loci and n is the sample size. Missing data is encoded as -1.
    :param genotypes: If True, use 0, 1, 2 genotypes. If False, use 0, 1
        phased haplotypes.
    """
    D2, Dz, pi2, D = compute_pairwise_stats(Gs, genotypes=genotypes)
    return np.mean(D2), np.mean(Dz), np.mean(pi2), np.mean(D)


def compute_pairwise_stats_between(Gs1, Gs2, genotypes=True):
    """
    Computes :math:`D^2`, :math:`Dz`, :math:`\\pi_2`, and :math:`D` for every
    pair of loci between two blocks of SNPs, coded as genotype matrices.

    The Gs are matrices, where rows correspond to loci and columns to
    individuals. Both matrices must have the same number of individuals.

    If Gs1 has L1 loci and Gs2 has L2 loci, we compute all pairwise counts,
    which have size (L1*L2, 9).

    We use the sparse genotype matrix representation: we first "sparsify" the
    genotype matrices, then count two-locus genotype configurations from the
    sparse representations, from which we compute the two-locus statistics.

    :param Gs1: A genotype matrix, of size L1-by-n, where L1 is the number of
        loci and n is the sample size. Missing data is encoded as -1.
    :param Gs2: A genotype matrix, of size L2-by-n, where L2 is the number of
        loci and n is the sample size. Missing data is encoded as -1.
    :param genotypes: If True, use 0, 1, 2 genotypes. If False, use 0, 1
        phased haplotypes.
    """
    assert (
        ld_extensions == 1
    ), "Need to build LD cython extensions. Install moments with the flag `--ld_extensions`"

    _check_valid_genotype_matrix(Gs1, genotypes)
    _check_valid_genotype_matrix(Gs2, genotypes)

    L1, n1 = np.shape(Gs1)
    L2, n2 = np.shape(Gs2)
    if n1 != n2:
        raise ValueError("data must have same number of sequenced individuals")
    else:
        n = n1

    if genotypes:
        G_dict1, any_missing1 = _sparsify_genotype_matrix(Gs1)
        G_dict2, any_missing2 = _sparsify_genotype_matrix(Gs2)
    else:
        G_dict1, any_missing1 = _sparsify_haplotype_matrix(Gs1)
        G_dict2, any_missing2 = _sparsify_haplotype_matrix(Gs2)

    any_missing = np.logical_or(any_missing1, any_missing2)

    if genotypes:
        Counts = spt.count_genotypes_between_sparse(
            G_dict1, G_dict2, n, missing=any_missing
        )
    else:
        Counts = spt.count_haplotypes_between_sparse(
            G_dict1, G_dict2, n, missing=any_missing
        )

    if genotypes:
        D2 = gcs.compute_D2(Counts)
        Dz = gcs.compute_Dz(Counts)
        pi2 = gcs.compute_pi2(Counts)
        D = gcs.compute_D(Counts)
    else:
        D = shc.D(Counts.T)
        D2 = shc.DD([Counts.T], [0, 0])
        Dz = shc.Dz([Counts.T], [0, 0, 0])
        pi2 = shc.pi2([Counts.T], [0, 0, 0, 0])

    return D2, Dz, pi2, D
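

# Usage sketch (not part of the original module; random data for
# illustration): statistics for pairs of loci spanning two blocks that share
# the same 20 samples.
def _example_pairwise_between():  # pragma: no cover
    rng = np.random.default_rng(seed=13)
    Gs1 = rng.integers(0, 3, size=(5, 20))
    Gs2 = rng.integers(0, 3, size=(8, 20))
    D2, Dz, pi2, D = compute_pairwise_stats_between(Gs1, Gs2)
    # one entry per cross-block pair: 5 * 8 = 40
    print(len(D2), "pairs")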


def compute_average_stats_between(Gs1, Gs2, genotypes=True):
    """
    Takes the outputs of ``compute_pairwise_stats_between`` and returns the
    average value for each statistic.

    :param Gs1: A genotype matrix, of size L1-by-n, where L1 is the number of
        loci and n is the sample size. Missing data is encoded as -1.
    :param Gs2: A genotype matrix, of size L2-by-n, where L2 is the number of
        loci and n is the sample size. Missing data is encoded as -1.
    :param genotypes: If True, use 0, 1, 2 genotypes. If False, use 0, 1
        phased haplotypes.
    """
    D2, Dz, pi2, D = compute_pairwise_stats_between(Gs1, Gs2, genotypes=genotypes)
    return np.mean(D2), np.mean(Dz), np.mean(pi2), np.mean(D)


def _count_types_sparse(
    genotypes,
    bins,
    sample_ids,
    positions=None,
    pos_rs=None,
    pop_file=None,
    pops=None,
    use_genotypes=True,
    report=True,
    report_spacing=1000,
    use_cache=True,
    stats_to_compute=None,
    normalized_by=None,
):
    assert (
        ld_extensions == 1
    ), "Need to build LD cython extensions. Install moments with the flag `--ld_extensions`"

    pop_indexes = {}
    if pops is not None:
        ## get columns to keep, and compress data and sample_ids
        samples = pandas.read_csv(pop_file, delim_whitespace=True)
        cols_to_keep = np.array([False] * np.shape(genotypes)[1])
        all_samples_to_keep = []
        for pop in pops:
            all_samples_to_keep += list(samples[samples["pop"] == pop]["sample"])
        sample_list = list(sample_ids)
        for s in all_samples_to_keep:
            cols_to_keep[sample_list.index(s)] = True
        genotypes_pops = genotypes.compress(cols_to_keep, axis=1)
        sample_ids_pops = list(np.array(sample_list).compress(cols_to_keep))

        ## keep only biallelic genotypes from populations in pops, discard the rest
        allele_counts_pops = genotypes_pops.count_alleles()
        is_biallelic = allele_counts_pops.is_biallelic_01()
        genotypes_pops = genotypes_pops.compress(is_biallelic)

        ## get the sample indexes for each population
        for pop in pops:
            pop_indexes[pop] = np.array([False] * np.shape(genotypes_pops)[1])
            for s in samples[samples["pop"] == pop]["sample"]:
                pop_indexes[pop][sample_ids_pops.index(s)] = True

        if use_genotypes == False:
            pop_indexes_haps = {}
            for pop in pops:
                pop_indexes_haps[pop] = np.reshape(
                    list(zip(pop_indexes[pop], pop_indexes[pop])),
                    (2 * len(pop_indexes[pop]),),
                )

        if positions is not None:
            positions = positions.compress(is_biallelic)
        if pos_rs is not None:
            pos_rs = pos_rs.compress(is_biallelic)
    else:
        if report == True:
            print("No populations given, using all samples as one population.")
            sys.stdout.flush()
        pops = ["ALL"]
        pop_indexes["ALL"] = np.array([True] * np.shape(genotypes)[1])
        genotypes_pops = genotypes
        if use_genotypes == False:
            pop_indexes_haps = {}
            for pop in pops:
                pop_indexes_haps[pop] = np.reshape(
                    list(zip(pop_indexes[pop], pop_indexes[pop])),
                    (2 * len(pop_indexes[pop]),),
                )

    ## convert to 0,1,2 format
    if use_genotypes == True:
        genotypes_pops_012 = genotypes_pops.to_n_alt()
    else:
        try:
            haplotypes_pops_01 = genotypes_pops.to_haplotypes()
        except AttributeError:
            print(
                "warning: attempted to get haplotypes from phased genotypes, "
                "returned attribute error. Using input as haplotypes."
            )
            haplotypes_pops_01 = genotypes_pops

    if pos_rs is not None:
        rs = pos_rs
    elif positions is not None:
        rs = positions

    ns = {}
    for pop in pops:
        if use_genotypes == True:
            ns[pop] = sum(pop_indexes[pop])
        else:
            ns[pop] = 2 * sum(pop_indexes[pop])

    bins = np.array(bins)

    ## split and sparsify the geno/haplo-type arrays for each population
    if use_genotypes == True:
        genotypes_by_pop = {}
        any_missing = False
        for pop in pops:
            temp_genotypes = genotypes_pops_012.compress(pop_indexes[pop], axis=1)
            genotypes_by_pop[pop], this_missing = _sparsify_genotype_matrix(
                temp_genotypes
            )
            any_missing = np.logical_or(any_missing, this_missing)
    else:
        haplotypes_by_pop = {}
        any_missing = False
        for pop in pops:
            temp_haplotypes = haplotypes_pops_01.compress(pop_indexes_haps[pop], axis=1)
            haplotypes_by_pop[pop], this_missing = _sparsify_haplotype_matrix(
                temp_haplotypes
            )
            any_missing = np.logical_or(any_missing, this_missing)

    ## if use_cache is True, type_counts will store the number of times we
    ## see each genotype count configuration within each bin
    ## if use_cache is False, we add to the running total of sums of each
    ## statistic as we count their genotypes, never storing the counts of
    ## configurations
    bs = list(zip(bins[:-1], bins[1:]))

    if use_cache == True:
        type_counts = {}
        for b in bs:
            type_counts[b] = defaultdict(int)
    else:
        sums = {}
        for b in bs:
            sums[b] = {}
            for stat in stats_to_compute[0]:
                sums[b][stat] = 0

    ## loop through left positions and pair with positions to the right
    ## within the bin windows
    ## this is a very inefficient, naive approach
    for ii, r in enumerate(rs[:-1]):
        if report is True:
            if ii % report_spacing == 0:
                print(
                    "tallied two locus counts {0} of {1} positions".format(ii, len(rs))
                )
                sys.stdout.flush()

        ## pick out the positions to the right of the left locus that fall
        ## within the bin windows
        if pos_rs is not None:
            distances = pos_rs - r
        else:
            distances = positions - r

        filt = np.logical_and(
            np.logical_and(distances >= bs[0][0], distances < bs[-1][1]),
            positions != positions[ii],
        )
        filt[ii] = False  # don't compare to mutations at same base pair position
        right_indices = np.where(filt == True)[0]

        ## if there are no variants within the bin windows to the right,
        ## continue to the next left position
        if len(right_indices) == 0:
            continue

        right_start = right_indices[0]
        right_end = right_indices[-1] + 1

        if use_cache == False:
            counts_ii = {}
            for b in bs:
                counts_ii[b] = [[] for pop_ind in range(len(pops))]

        ## loop through right loci and count two-locus genotypes
        for jj in range(right_start, right_end):
            # get the bin that this pair belongs to
            r_dist = distances[jj]
            bin_ind = np.where(r_dist >= bins)[0][-1]
            b = bs[bin_ind]

            # count genotypes within each population
            if use_genotypes == True:
                cs = tuple(
                    [
                        spt.tally_sparse(
                            genotypes_by_pop[pop][ii],
                            genotypes_by_pop[pop][jj],
                            ns[pop],
                            any_missing,
                        )
                        for pop in pops
                    ]
                )
            else:
                cs = tuple(
                    [
                        spt.tally_sparse_haplotypes(
                            haplotypes_by_pop[pop][ii],
                            haplotypes_by_pop[pop][jj],
                            ns[pop],
                            any_missing,
                        )
                        for pop in pops
                    ]
                )

            if use_cache == True:
                type_counts[b][cs] += 1
            else:
                for pop_ind in range(len(pops)):
                    counts_ii[b][pop_ind].append(cs[pop_ind])

        if use_cache == False:
            for b in bs:
                these_counts = np.array(counts_ii[b])
                if these_counts.shape[1] == 0:
                    continue
                for stat in stats_to_compute[0]:
                    sums[b][stat] += _call_sgc(
                        stat, these_counts.swapaxes(1, 2), use_genotypes
                    ).sum()

    if use_cache == True:
        return type_counts
    else:
        return sums


def _call_sgc(stat, Cs, use_genotypes=True):
    """
    stat = 'DD', 'Dz', or 'pi2', with underscore-separated population indices
    (like 'DD_1_1').
    Cs = an L-by-n array, with L count configurations and n = 4 or 9
    (for haplotypes or genotypes, respectively).
    """
    assert (
        ld_extensions == 1
    ), "Need to build LD cython extensions. Install moments with the flag `--ld_extensions`"

    s = stat.split("_")[0]
    pop_nums = [int(p) for p in stat.split("_")[1:]]
    if Cs.__class__ != np.ndarray:
        raise ValueError("Cs expected to be a numpy array, got", Cs.__class__)
    if Cs.dtype != np.int64:
        Cs = Cs.astype(np.int64)
    if s == "DD":
        if use_genotypes == True:
            return gcs_mp.DD(Cs, pop_nums)
        else:
            return shc.DD(Cs, pop_nums)
    if s == "Dz":
        ii, jj, kk = pop_nums
        if jj == kk:
            if use_genotypes == True:
                return gcs_mp.Dz(Cs, pop_nums)
            else:
                return shc.Dz(Cs, pop_nums)
        else:
            # average over the two orderings of the z populations
            if use_genotypes == True:
                return 1.0 / 2 * gcs_mp.Dz(Cs, [ii, jj, kk]) + 1.0 / 2 * gcs_mp.Dz(
                    Cs, [ii, kk, jj]
                )
            else:
                return 1.0 / 2 * shc.Dz(Cs, [ii, jj, kk]) + 1.0 / 2 * shc.Dz(
                    Cs, [ii, kk, jj]
                )
    if s == "pi2":
        ii, jj, kk, ll = pop_nums
        ### this doesn't consider the symmetry between p/q yet...
        if ii == jj:
            if kk == ll:
                if ii == kk:  # all the same
                    if use_genotypes == True:
                        return gcs_mp.pi2(Cs, [ii, jj, kk, ll])
                    else:
                        return shc.pi2(Cs, [ii, jj, kk, ll])
                else:  # (1, 1; 2, 2)
                    if use_genotypes == True:
                        return (
                            1.0
                            / 2
                            * (
                                gcs_mp.pi2(Cs, [ii, jj, kk, ll])
                                + gcs_mp.pi2(Cs, [kk, ll, ii, jj])
                            )
                        )
                    else:
                        return (
                            1.0
                            / 2
                            * (
                                shc.pi2(Cs, [ii, jj, kk, ll])
                                + shc.pi2(Cs, [kk, ll, ii, jj])
                            )
                        )
            else:  # (1, 1; 2, 3) or (1, 1; 1, 2)
                if use_genotypes == True:
                    return (
                        1.0
                        / 4
                        * (
                            gcs_mp.pi2(Cs, [ii, jj, kk, ll])
                            + gcs_mp.pi2(Cs, [ii, jj, ll, kk])
                            + gcs_mp.pi2(Cs, [kk, ll, ii, jj])
                            + gcs_mp.pi2(Cs, [ll, kk, ii, jj])
                        )
                    )
                else:
                    return (
                        1.0
                        / 4
                        * (
                            shc.pi2(Cs, [ii, jj, kk, ll])
                            + shc.pi2(Cs, [ii, jj, ll, kk])
                            + shc.pi2(Cs, [kk, ll, ii, jj])
                            + shc.pi2(Cs, [ll, kk, ii, jj])
                        )
                    )
        else:
            if kk == ll:  # (1, 2; 3, 3) or (1, 2; 2, 2)
                if use_genotypes == True:
                    return (
                        1.0
                        / 4
                        * (
                            gcs_mp.pi2(Cs, [ii, jj, kk, ll])
                            + gcs_mp.pi2(Cs, [jj, ii, kk, ll])
                            + gcs_mp.pi2(Cs, [kk, ll, ii, jj])
                            + gcs_mp.pi2(Cs, [kk, ll, jj, ii])
                        )
                    )
                else:
                    return (
                        1.0
                        / 4
                        * (
                            shc.pi2(Cs, [ii, jj, kk, ll])
                            + shc.pi2(Cs, [jj, ii, kk, ll])
                            + shc.pi2(Cs, [kk, ll, ii, jj])
                            + shc.pi2(Cs, [kk, ll, jj, ii])
                        )
                    )
            else:  # (1, 2; 3, 4)
                if use_genotypes == True:
                    return (
                        1.0
                        / 8
                        * (
                            gcs_mp.pi2(Cs, [ii, jj, kk, ll])
                            + gcs_mp.pi2(Cs, [ii, jj, ll, kk])
                            + gcs_mp.pi2(Cs, [jj, ii, kk, ll])
                            + gcs_mp.pi2(Cs, [jj, ii, ll, kk])
                            + gcs_mp.pi2(Cs, [kk, ll, ii, jj])
                            + gcs_mp.pi2(Cs, [ll, kk, ii, jj])
                            + gcs_mp.pi2(Cs, [kk, ll, jj, ii])
                            + gcs_mp.pi2(Cs, [ll, kk, jj, ii])
                        )
                    )
                else:
                    return (
                        1.0
                        / 8
                        * (
                            shc.pi2(Cs, [ii, jj, kk, ll])
                            + shc.pi2(Cs, [ii, jj, ll, kk])
                            + shc.pi2(Cs, [jj, ii, kk, ll])
                            + shc.pi2(Cs, [jj, ii, ll, kk])
                            + shc.pi2(Cs, [kk, ll, ii, jj])
                            + shc.pi2(Cs, [ll, kk, ii, jj])
                            + shc.pi2(Cs, [kk, ll, jj, ii])
                            + shc.pi2(Cs, [ll, kk, jj, ii])
                        )
                    )


def _cache_ld_statistics(type_counts, ld_stats, bins, use_genotypes=True, report=True):
    ### This function might not be needed anymore if we completely remove caching
    bs = list(zip(bins[:-1], bins[1:]))

    estimates = {}
    for b in bs:
        for cs in type_counts[b].keys():
            estimates.setdefault(cs, {})

    all_counts = np.array(list(estimates.keys()))
    all_counts = np.swapaxes(all_counts, 0, 1)
    all_counts = np.swapaxes(all_counts, 1, 2)

    for stat in ld_stats:
        if report is True:
            print("computing " + stat)
            sys.stdout.flush()
        vals = _call_sgc(stat, all_counts, use_genotypes)
        for ii in range(len(all_counts[0, 0])):
            cs = all_counts[:, :, ii]
            estimates[tuple([tuple(c) for c in cs])][stat] = vals[ii]

    return estimates


def _get_ld_stat_sums(type_counts, ld_stats, bins, use_genotypes=True, report=True):
    """
    Returns sums[b][stat] for each bin b and each statistic.
    """
    ### this is super inefficient, just trying to get around memory issues
    bs = list(zip(bins[:-1], bins[1:]))
    sums = {}

    if use_genotypes is True:
        empty_genotypes = tuple([0] * 9)
    else:
        empty_genotypes = tuple([0] * 4)

    for stat in ld_stats:
        if report is True:
            print("computing " + stat)
            sys.stdout.flush()
        # zero out the counts of populations not used by this statistic, then
        # reduce to the set of distinct configurations
        pops_in_stat = sorted(list(set(int(p) for p in stat.split("_")[1:])))
        stat_counts = {}
        for b in bs:
            for cs in type_counts[b].keys():
                this_count = list(cs)
                for i in range(len(cs)):
                    if i not in pops_in_stat:
                        this_count[i] = empty_genotypes
                stat_counts.setdefault(tuple(this_count), defaultdict(int))
                stat_counts[tuple(this_count)][b] += type_counts[b][cs]

        all_counts = np.array(list(stat_counts.keys()))
        all_counts = np.swapaxes(all_counts, 0, 1)
        all_counts = np.swapaxes(all_counts, 1, 2)

        vals = _call_sgc(stat, all_counts, use_genotypes)
        estimates = {}
        for v, ii in zip(vals, range(len(all_counts[0, 0]))):
            cs = tuple(tuple(c) for c in all_counts[:, :, ii])
            estimates[cs] = v

        for b in bs:
            sums.setdefault(b, {})
            sums[b][stat] = 0
            for cs in stat_counts:
                sums[b][stat] += stat_counts[cs][b] * estimates[cs]

    return sums


def _get_H_statistics(
    genotypes, sample_ids, pop_file=None, pops=None, ac_filter=True, report=True
):
    """
    Het values are not normalized by sequence length; we would need to
    compute L from the bed file.
    """
    if pop_file == None and pops == None:
        if report == True:
            print(
                "No population file or population names given, "
                "assuming all samples as single pop."
            )
            sys.stdout.flush()
    elif pops == None:
        raise ValueError("pop_file given, but not population names...")
    elif pop_file == None:
        raise ValueError("Population names given, but not pop_file...")

    if pops == None:
        pops = ["ALL"]

    if pop_file is not None:
        samples = pandas.read_csv(pop_file, delim_whitespace=True)
        populations = np.array(samples["pop"].value_counts().keys())
        samples.reset_index(drop=True, inplace=True)

        ### should use this above when counting two locus genotypes
        sample_ids_list = list(sample_ids)
        subpops = {
            # for each population, get the list of samples that
            # belong to the population
            pop_iter: [
                sample_ids_list.index(ind)
                for ind in samples[samples["pop"] == pop_iter]["sample"]
            ]
            for pop_iter in pops
        }
        ac_subpop = genotypes.count_alleles_subpops(subpops)
    else:
        subpops = {pop_iter: list(range(len(sample_ids))) for pop_iter in pops}
        ac_subpop = genotypes.count_alleles_subpops(subpops)

    # ensure at least 2 allele counts per pop
    min_ac_filter = [True] * len(ac_subpop)
    if ac_filter == True:
        for pop in pops:
            min_ac_filter = np.logical_and(
                min_ac_filter, np.sum(ac_subpop[pop], axis=1) >= 2
            )

    ac_subpop_filt = {}
    for pop in pops:
        ac_subpop_filt[pop] = np.array(ac_subpop[pop]).compress(min_ac_filter, axis=0)

    Hs = {}
    for ii, pop1 in enumerate(pops):
        for pop2 in pops[ii:]:
            if pop1 == pop2:
                H = np.sum(
                    2.0
                    * ac_subpop_filt[pop1][:, 0]
                    * ac_subpop_filt[pop1][:, 1]
                    / (ac_subpop_filt[pop1][:, 0] + ac_subpop_filt[pop1][:, 1])
                    / (ac_subpop_filt[pop1][:, 0] + ac_subpop_filt[pop1][:, 1] - 1)
                )
            else:
                H = np.sum(
                    1.0
                    * ac_subpop_filt[pop1][:, 0]
                    * ac_subpop_filt[pop2][:, 1]
                    / (ac_subpop_filt[pop1][:, 0] + ac_subpop_filt[pop1][:, 1])
                    / (ac_subpop_filt[pop2][:, 0] + ac_subpop_filt[pop2][:, 1])
                    + 1.0
                    * ac_subpop_filt[pop1][:, 1]
                    * ac_subpop_filt[pop2][:, 0]
                    / (ac_subpop_filt[pop1][:, 0] + ac_subpop_filt[pop1][:, 1])
                    / (ac_subpop_filt[pop2][:, 0] + ac_subpop_filt[pop2][:, 1])
                )
            Hs[(pop1, pop2)] = H

    return Hs


def _get_reported_stats(
    genotypes,
    bins,
    sample_ids,
    positions=None,
    pos_rs=None,
    pop_file=None,
    pops=None,
    use_genotypes=True,
    report=True,
    report_spacing=1000,
    use_cache=True,
    stats_to_compute=None,
    ac_filter=True,
):
    ### wrapping function that can take use_cache = True or False

    # if bins is empty, we only return heterozygosity statistics
    if stats_to_compute == None:
        if pops is None:
            stats_to_compute = Util.moment_names(1)
        else:
            stats_to_compute = Util.moment_names(len(pops))

    bs = list(zip(bins[:-1], bins[1:]))

    if use_cache == True:
        type_counts = _count_types_sparse(
            genotypes,
            bins,
            sample_ids,
            positions=positions,
            pos_rs=pos_rs,
            pop_file=pop_file,
            pops=pops,
            use_genotypes=use_genotypes,
            report=report,
            report_spacing=report_spacing,
            use_cache=use_cache,
        )
        sums = _get_ld_stat_sums(
            type_counts,
            stats_to_compute[0],
            bins,
            use_genotypes=use_genotypes,
            report=report,
        )
    else:
        sums = _count_types_sparse(
            genotypes,
            bins,
            sample_ids,
            positions=positions,
            pos_rs=pos_rs,
            pop_file=pop_file,
            pops=pops,
            use_genotypes=use_genotypes,
            report=report,
            report_spacing=report_spacing,
            use_cache=use_cache,
            stats_to_compute=stats_to_compute,
        )

    if report is True:
        print("computed sums\ngetting heterozygosity statistics")
        sys.stdout.flush()

    if len(stats_to_compute[1]) == 0:
        Hs = {}
    else:
        Hs = _get_H_statistics(
            genotypes,
            sample_ids,
            pop_file=pop_file,
            pops=pops,
            ac_filter=ac_filter,
            report=report,
        )

    reported_stats = {}
    reported_stats["bins"] = bs
    reported_stats["sums"] = [np.empty(len(stats_to_compute[0])) for b in bs] + [
        np.empty(len(stats_to_compute[1]))
    ]
    for ii, b in enumerate(bs):
        for s in stats_to_compute[0]:
            reported_stats["sums"][ii][stats_to_compute[0].index(s)] = sums[b][s]

    if pops == None:
        pops = ["ALL"]

    for s in stats_to_compute[1]:
        reported_stats["sums"][-1][stats_to_compute[1].index(s)] = Hs[
            (pops[int(s.split("_")[1])], pops[int(s.split("_")[2])])
        ]

    reported_stats["stats"] = stats_to_compute
    reported_stats["pops"] = pops

    return reported_stats
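

# Note (added for clarity; reading off the per-site estimators implemented in
# _get_H_statistics above, with n_0 and n_1 the reference and alternate
# allele counts at a site): within a population, heterozygosity is summed as
#     H = 2 * n_0 * n_1 / ((n_0 + n_1) * (n_0 + n_1 - 1)),
# and between populations 1 and 2 as
#     H = (n_0^(1) * n_1^(2) + n_1^(1) * n_0^(2))
#         / ((n_0^(1) + n_1^(1)) * (n_0^(2) + n_1^(2))),
# which is why ac_filter requires at least two called alleles per population
# (the within-population denominator would otherwise hit zero).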


def compute_ld_statistics(
    vcf_file,
    bed_file=None,
    chromosome=None,
    rec_map_file=None,
    map_name=None,
    map_sep=None,
    pop_file=None,
    pops=None,
    cM=True,
    r_bins=None,
    bp_bins=None,
    min_bp=None,
    use_genotypes=True,
    use_h5=True,
    stats_to_compute=None,
    ac_filter=True,
    report=True,
    report_spacing=1000,
    use_cache=True,
):
    """
    Computes LD statistics for a given VCF. Binning can be done by base pair
    or recombination distances, the latter requiring a recombination map. For
    more than one population, we include a population file that maps samples
    to populations, and specify which populations to compute statistics for.

    If data is phased, we can set ``use_genotypes`` to False, and there are
    other options for masking data.

    .. note:: Currently, the recombination map is not given in HapMap format.
        Future versions will accept HapMap formatted recombination maps and
        deprecate some of the boutique handling of map options here.

    :param str vcf_file: The input VCF file name.
    :param str bed_file: An optional bed file that specifies regions over
        which to compute LD statistics. If None, computes statistics for all
        positions in the VCF.
    :param str chromosome: If None, treats all positions in the VCF as coming
        from the same chromosome. If multiple chromosomes are reported in the
        same VCF, we need to specify which chromosome to keep variants from.
    :param str rec_map_file: The input recombination map. The format is
        {pos}\t{map (cM)}\t{additional maps}\n
    :param str map_name: If None, takes the first map column, otherwise takes
        the specified map column with the name matching the recombination map
        file header.
    :param str map_sep: Deprecated! We now read the recombination map,
        splitting by any white space. Previous behavior: tells pandas how to
        parse the recombination map.
    :param str pop_file: A file that specifies the population for each sample
        in the VCF. Each sample is listed on its own line, in the format
        "{sample}\t{pop}". The first line must be "sample\tpop".
    :param list(str) pops: List of populations to compute statistics for.
        If none are given, it treats every sample as coming from the same
        population.
    :param bool cM: If True, the recombination map is specified in cM. If
        False, the map is given in units of Morgans.
    :param list(float) r_bins: A list of raw recombination rate bin edges.
    :param list(float) bp_bins: If ``r_bins`` are not given, a list of bp bin
        edges (for use when no recombination map is specified).
    :param int, float min_bp: The minimum bp allowed for a segment specified
        by the bed file.
    :param bool use_genotypes: If True, we assume the data in the VCF is
        unphased. Otherwise, we use phased information.
    :param bool use_h5: If True, we use the h5 format.
    :param list stats_to_compute: If given, we compute only the statistics
        specified. Otherwise, we compute all possible statistics for the
        populations given.
    :param ac_filter: Ensure at least two samples are present per population.
        This prevents computed heterozygosity statistics from returning NaN
        when some loci have too few called samples.
    :param bool report: If True, we report the progress of our parsing.
    :param int report_spacing: We track the number of "left" variants we
        compute, and report our progress with the given spacing.
    :param bool use_cache: If True, cache intermediate results.
    """
    check_imports()

    if r_bins is None and bp_bins is None:
        warnings.warn(
            "Both r_bins and bp_bins are None, so no LD statistics will be computed"
        )
        bins = []
    elif r_bins is not None:
        if bp_bins is not None:
            raise ValueError("Can specify only recombination or bp bins, not both")

    positions, genotypes, counts, sample_ids = get_genotypes(
        vcf_file,
        bed_file=bed_file,
        chromosome=chromosome,
        min_bp=min_bp,
        use_h5=use_h5,
        report=report,
    )

    if report == True:
        print("kept {0} total variants".format(len(positions)))
        sys.stdout.flush()

    if rec_map_file is not None and r_bins is not None:
        if report is True:
            print("assigning recombination rates to positions")
            sys.stdout.flush()
        pos_rs = _assign_recombination_rates(
            positions, rec_map_file, map_name=map_name, cM=cM, report=report
        )
        bins = r_bins
    else:
        if report is True:
            print("no recombination map provided, using physical distance")
            sys.stdout.flush()
        pos_rs = None
        if bp_bins is not None:
            bins = bp_bins
        else:
            bins = []

    if not np.all([b1 - b0 > 0 for b0, b1 in zip(bins[:-1], bins[1:])]):
        raise ValueError("bins must be a monotonically increasing list")

    reported_stats = _get_reported_stats(
        genotypes,
        bins,
        sample_ids,
        positions=positions,
        pos_rs=pos_rs,
        pop_file=pop_file,
        pops=pops,
        use_genotypes=use_genotypes,
        report=report,
        stats_to_compute=stats_to_compute,
        report_spacing=report_spacing,
        use_cache=use_cache,
        ac_filter=ac_filter,
    )

    return reported_stats


def means_from_region_data(all_data, stats, norm_idx=0):
    """
    Get means over all parsed regions.

    :param dict all_data: A dictionary with keys as unique identifiers of the
        regions and values as reported stats from ``compute_ld_statistics``.
    :param list of lists stats: The list of LD and H statistics that are
        present in the data replicates.
    :param int, optional norm_idx: The index of the population to normalize
        by.
    """
    norm_stats = ["pi2_{0}_{0}_{0}_{0}".format(norm_idx), "H_{0}_{0}".format(norm_idx)]

    means = [0 * sums for sums in all_data[list(all_data.keys())[0]]["sums"]]

    for reg in all_data.keys():
        for ii in range(len(means)):
            means[ii] += all_data[reg]["sums"][ii]

    # normalize the LD sums by pi2 in the chosen population, and the H sums
    # by H in the chosen population
    for ii in range(len(means) - 1):
        means[ii] /= means[ii][stats[0].index(norm_stats[0])]
    means[-1] /= means[-1][stats[1].index(norm_stats[1])]

    return means
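

# Usage sketch (not part of the original module; region and file names are
# hypothetical): combine several parsed regions into normalized means.
def _example_region_means():  # pragma: no cover
    r_bins = [0, 1e-6, 1e-5, 1e-4, 1e-3]
    all_data = {
        reg: compute_ld_statistics(
            "data/chr22.vcf.gz",
            bed_file=f"data/{reg}.bed",
            rec_map_file="data/chr22_map.txt",
            r_bins=r_bins,
        )
        for reg in ["region1", "region2", "region3"]
    }
    stats = all_data["region1"]["stats"]
    means = means_from_region_data(all_data, stats, norm_idx=0)
    return means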


def get_bootstrap_sets(
    all_data, num_bootstraps=None, normalization=0, remove_norm_stats=True
):
    """
    From a dictionary of all the regional data, resample with replacement to
    construct bootstrap data.

    Returns a list of bootstrapped datasets of mean statistics.

    :param dict all_data: Dictionary of regional LD statistics. Keys are
        region identifiers and must be unique, and the items are the outputs
        of ``compute_ld_statistics``.
    :param int num_bootstraps: The number of bootstrap replicates to compute.
        If None, it computes the same number as the number of regions in
        ``all_data``.
    :param int normalization: The index of the population to normalize by.
        Defaults to 0.
    :param bool remove_norm_stats: If True, the normalizing statistics (pi2
        and H in the normalizing population) are removed from each replicate.
    """
    regions = list(all_data.keys())
    reg = regions[0]
    stats = all_data[reg]["stats"]

    num_regions = len(all_data)
    if num_bootstraps is None:
        num_bootstraps = num_regions

    all_boot = []
    for rep in range(num_bootstraps):
        temp_data = {}
        choices = np.random.choice(regions, num_regions, replace=True)
        for i, c in enumerate(choices):
            temp_data[i] = all_data[c]
        boot_means = means_from_region_data(temp_data, stats, normalization)
        if remove_norm_stats:
            for ii in range(len(boot_means)):
                if ii < len(boot_means) - 1:
                    delete_idx = stats[0].index(
                        "pi2_{0}_{0}_{0}_{0}".format(normalization)
                    )
                else:
                    delete_idx = stats[1].index("H_{0}_{0}".format(normalization))
                boot_means[ii] = np.delete(boot_means[ii], delete_idx)
        all_boot.append(boot_means)
    return all_boot
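

# Usage sketch (not part of the original module): 100 bootstrap replicates of
# the normalized means from previously parsed regions in `all_data`.
def _example_bootstrap_sets(all_data):  # pragma: no cover
    boot_sets = get_bootstrap_sets(all_data, num_bootstraps=100, normalization=0)
    print(len(boot_sets))  # 100 replicates of per-bin mean statistics
    return boot_sets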


def bootstrap_data(all_data, normalization=0):
    """
    Returns bootstrapped variances for LD statistics. This function operates
    on data that is sums (i.e. the direct output of
    ``compute_ld_statistics()``), instead of mean statistics.

    We first check that 'stats', 'bins', and 'pops' (if present) match across
    all regions.

    If there are N total regions, we compute N bootstrap replicates by
    sampling N times with replacement and summing over all 'sums'.

    :param dict all_data: A dictionary (with arbitrary keys), where each
        value is LD statistics computed from a distinct region. The stats
        from each region, all_data[reg], have keys 'bins', 'sums', 'stats',
        and optionally 'pops'.
    :param int normalization: We work with :math:`\\sigma_d^2` statistics,
        and by default we use population 0 to normalize stats.
    """
    # check that the var-cov matrix will be able to be computed
    k = list(all_data.keys())[0]
    for v in all_data[k]["sums"]:
        if len(all_data) < len(v):
            raise ValueError(
                "There are not enough independent regions to compute the "
                "variance-covariance matrix"
            )

    norm_stats = [
        "pi2_{0}_{0}_{0}_{0}".format(normalization),
        "H_{0}_{0}".format(normalization),
    ]

    regions = list(all_data.keys())
    reg = regions[0]
    stats = all_data[reg]["stats"]

    N = len(regions)

    means = means_from_region_data(all_data, stats, normalization)

    # construct bootstrap data
    bootstrap_data = [np.zeros((len(sums), N)) for sums in means]

    for boot_num in range(N):
        boot_means = [0 * sums for sums in means]
        samples = np.random.choice(regions, N)
        for reg in samples:
            for ii in range(len(boot_means)):
                boot_means[ii] += all_data[reg]["sums"][ii]

        for ii in range(len(boot_means) - 1):
            boot_means[ii] /= boot_means[ii][stats[0].index(norm_stats[0])]
        boot_means[-1] /= boot_means[-1][stats[1].index(norm_stats[1])]

        for ii in range(len(boot_means)):
            bootstrap_data[ii][:, boot_num] = boot_means[ii]

    varcovs = [np.cov(bootstrap_data[ii]) for ii in range(len(bootstrap_data))]

    mv = {}
    mv["bins"] = all_data[reg]["bins"]
    mv["stats"] = all_data[reg]["stats"]
    if "pops" in all_data[reg]:
        mv["pops"] = all_data[reg]["pops"]
    mv["means"] = means
    mv["varcovs"] = varcovs
    return mv
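

# Usage sketch (not part of the original module): turn regional sums into
# normalized means plus bootstrap variance-covariance matrices.
def _example_bootstrap_data(all_data):  # pragma: no cover
    mv = bootstrap_data(all_data, normalization=0)
    # mv["means"][i] and mv["varcovs"][i] correspond to mv["bins"][i], with
    # the final entries holding the heterozygosity statistics
    return mv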


def subset_data(
    data, pops_to, normalization=0, r_min=None, r_max=None, remove_Dz=False
):
    """
    Take the output data and get r_edges, ms, vcs, and stats to pass to the
    inference machinery.

    ``pops_to`` is the subset of the populations to marginalize the data to.
    ``r_min`` and ``r_max`` trim bins that fall outside of this range, and
    ``remove_Dz`` allows us to remove all :math:`\\sigma_{Dz}` statistics.

    :param data: The output of ``bootstrap_data``, which contains bins,
        statistics, populations, means, and variance-covariance matrices.
    :param pops_to: A list of populations to subset to.
    :param normalization: The population index that the original data was
        normalized by.
    :param r_min: The minimum recombination distance to keep.
    :param r_max: The maximum recombination distance to keep.
    :param remove_Dz: If True, remove all Dz statistics. Otherwise keep them.
    """
    pops_from = data["pops"]
    if not np.all([p in pops_from for p in pops_to]):
        raise ValueError("All pops in pops_to must be in data")

    new_pop_ids = {}
    for pop in pops_to:
        new_pop_ids[pops_from.index(pop)] = pops_to.index(pop)

    stats = data["stats"]
    to_remove = [[], []]
    new_stats = [[], []]
    for j in [0, 1]:
        for i, stat in enumerate(stats[j]):
            if stat in [
                "pi2_{0}_{0}_{0}_{0}".format(normalization),
                "H_{0}_{0}".format(normalization),
            ]:
                to_remove[j].append(i)
            else:
                if remove_Dz == True:
                    if stat.split("_")[0] == "Dz":
                        to_remove[j].append(i)
                        continue
                p_inds = [int(x) for x in stat.split("_")[1:]]
                if len(set(p_inds) - set(new_pop_ids)) == 0:
                    new_stat = "_".join(
                        [stat.split("_")[0]] + [str(new_pop_ids[x]) for x in p_inds]
                    )
                    new_stats[j].append(new_stat)
                else:
                    to_remove[j].append(i)

    means = []
    varcovs = []

    for i, b in enumerate(data["bins"]):
        if r_min is not None:
            if b[0] < r_min:
                continue
        if r_max is not None:
            if b[1] > r_max:
                continue
        means.append(np.delete(data["means"][i], to_remove[0]))
        varcovs.append(
            np.delete(
                np.delete(data["varcovs"][i], to_remove[0], axis=0),
                to_remove[0],
                axis=1,
            )
        )

    means.append(np.delete(data["means"][-1], to_remove[1]))
    varcovs.append(
        np.delete(
            np.delete(data["varcovs"][-1], to_remove[1], axis=0), to_remove[1], axis=1
        )
    )

    r_edges = np.array(sorted(list(set(np.array(data["bins"]).flatten()))))
    if r_min is not None:
        r_edges = r_edges[r_edges >= r_min]
    if r_max is not None:
        r_edges = r_edges[r_edges <= r_max]

    return r_edges, means, varcovs, new_stats
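

# Usage sketch (not part of the original module; the population name is
# hypothetical): marginalize bootstrapped data to one population and trim the
# recombination range before passing it to the inference machinery.
def _example_subset(mv):  # pragma: no cover
    r_edges, ms, vcs, stats = subset_data(
        mv, ["YRI"], normalization=0, r_min=1e-6, r_max=1e-3, remove_Dz=False
    )
    return r_edges, ms, vcs, stats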