arg_needle_lib

class arg_needle_lib.ARG

Bases: pybind11_object

add_sample(self: arg_needle_lib_pybind.ARG, sample_name: str = '') int
check_basic(self: arg_needle_lib_pybind.ARG, stringent: bool = True) None

Basic checks after initializing an ARG

check_children(self: arg_needle_lib_pybind.ARG, stringent: bool = True) None

Checks after populating children

check_roots(self: arg_needle_lib_pybind.ARG) None

Checks after populating roots

get_id_of_closest_site(self: arg_needle_lib_pybind.ARG, position: float) int
get_idx_of_first_mutation_left_of(self: arg_needle_lib_pybind.ARG, physical_pos: float, include_equal: bool = False, warn_out_of_range: bool = True) int
get_idx_of_first_mutation_right_of(self: arg_needle_lib_pybind.ARG, physical_pos: float, include_equal: bool = False, warn_out_of_range: bool = True) int
get_idx_of_mutation_closest_to(self: arg_needle_lib_pybind.ARG, physical_pos: float) int
get_mutation_sites(self: arg_needle_lib_pybind.ARG) Dict[float, arg_needle_lib_pybind.Site]
get_site(self: arg_needle_lib_pybind.ARG, site_id: int) float
get_sites(self: arg_needle_lib_pybind.ARG) List[float]
is_leaf(self: arg_needle_lib_pybind.ARG, node_id: int) bool
lowest_mutated_edge(self: arg_needle_lib_pybind.ARG, arg0: int, arg1: float) arg_needle_lib_pybind.ARGEdge
lowest_mutated_edge_by_site(self: arg_needle_lib_pybind.ARG, arg0: int, arg1: int) arg_needle_lib_pybind.ARGEdge
mrca(self: arg_needle_lib_pybind.ARG, ID1: int, ID2: int, position: float) arg_needle_lib_pybind.ARGNode
mutations(self: arg_needle_lib_pybind.ARG) List[arg_needle_lib_pybind.Mutation]

Return list of all Mutation objects

node(self: arg_needle_lib_pybind.ARG, ID: int) arg_needle_lib_pybind.ARGNode

Get node at ID

node_ids(self: arg_needle_lib_pybind.ARG) List[int]

Return list of all node IDs

num_edges(self: arg_needle_lib_pybind.ARG) int
num_mutations(self: arg_needle_lib_pybind.ARG) int
num_nodes(self: arg_needle_lib_pybind.ARG) int
num_samples(self: arg_needle_lib_pybind.ARG) int

Return number of samples

num_sites(self: arg_needle_lib_pybind.ARG) int
populate_children_and_roots(self: arg_needle_lib_pybind.ARG) None
populate_mutations_on_edges(self: arg_needle_lib_pybind.ARG) None
root_at(self: arg_needle_lib_pybind.ARG, position: float) arg_needle_lib_pybind.Root
root_starts(self: arg_needle_lib_pybind.ARG) List[float]
set_chromosome(self: arg_needle_lib_pybind.ARG, chromosome: int) None
set_offset(self: arg_needle_lib_pybind.ARG, offset: int) None
set_sites(self: arg_needle_lib_pybind.ARG, positions: List[float]) None
thread_sample(self: arg_needle_lib_pybind.ARG, section_starts: List[float], sample_ids: List[int], heights: List[float]) None
property chromosome
property end
property leaf_ids
property offset
property reserved_samples
property sample_names
property start
property threaded_samples
class arg_needle_lib.ARGEdge

Bases: pybind11_object

mutations(self: arg_needle_lib_pybind.ARGEdge) List[Mutation]

Return list of all mutation objects

property child
property end
property parent
property start
class arg_needle_lib.ARGNode

Bases: pybind11_object

add_parent(self: arg_needle_lib_pybind.ARGNode, start: float, end: float, parent: arg_needle_lib_pybind.ARGNode) ARGEdge
parent_edge_at(self: arg_needle_lib_pybind.ARGNode, position: float) ARGEdge
parent_edges(self: arg_needle_lib_pybind.ARGNode) List[ARGEdge]

Returns a sorted list of parent edges

parent_starts(self: arg_needle_lib_pybind.ARGNode) List[float]

Returns a sorted list of parent starts

remove_parent(self: arg_needle_lib_pybind.ARGNode, start: float) None
update_parent_end(self: arg_needle_lib_pybind.ARGNode, start: float, end_new: float) None
update_parent_start(self: arg_needle_lib_pybind.ARGNode, start_old: float, start_new: float) None
property ID
property end
property height
property start
class arg_needle_lib.DescendantList

Bases: pybind11_object

static print_threshold() None

Print threshold

static set_threshold(threshold: int) None

Set threshold

class arg_needle_lib.Mutation

Bases: pybind11_object

property edge
property height
property position
property site_id
class arg_needle_lib.Root

Bases: pybind11_object

property end
property node
property start
class arg_needle_lib.Site

Bases: pybind11_object

get_mutations(self: arg_needle_lib_pybind.Site) List[arg_needle_lib_pybind.Mutation]
get_position(self: arg_needle_lib_pybind.Site) float
arg_needle_lib.arg_to_newick(arg: arg_needle_lib_pybind.ARG, verbose: bool = False) str

Return a Newick representation of an ARG, verbose includes branch lengths

arg_needle_lib.arg_to_tskit(arg, batch_size=None, mutations=True, sample_permutation=None)

Convert arg_needle_lib ARG object to tskit.TreeSequence using tables

For this to work properly, the node IDs in the ARG object must all be consecutive.

Parameters:
  • arg – arg_needle_lib.ARG object to convert

  • batch_size (int) – Maximum allowed size for edge list information mainly used to save memory (default is no batching)

  • mutations (bool) – If True, converts mutations as well (default=True)

  • sample_permutation – If not None, then should be a permutation of values from 0 to num_samples - 1, either as a numpy array or Python list. Sample i in the resulting tskit.TreeSequence will correspond to sample sample_permutation[i] in the arg_needle_lib.ARG.

Returns:

Converted tskit.TreeSequence object.

Raises:

ValueError – Error in ARG format, batch_size parameter, or sample_permutation.

arg_needle_lib.association_diploid_all(arg: arg_needle_lib_pybind.ARG, phenotype: List[float], use_sample: List[bool], file_root: str, chromosome: int = 1, snp_prefix: str = '', min_maf: float = -1, max_maf: float = -1, write_bitset_threshold: float = -1, calibration_factor: float = 1, concise_pvalue: bool = True, max_only: bool = False, careful: bool = False) float

ARG diploid association testing all clades.

Writes summary statistics to [file_root].tab.gz and returns the max chi2. Additionally, can write significant variants in [file_root].haps.gz if write_bitset_threshold is passed.

Parameters:
  • arg – arg_needle_lib.ARG object with 2*n leaves

  • phenotype – a length n array of phenotypes in the same order as the ARG samples. Missing values are specified using the use_sample array and can be filled with any value in the phenotype.

  • use_sample – a length n array of booleans, False means the phenotype is missing

  • file_root – results are written to [file_root].tab.gz and [file_root].haps.gz

  • chromosome – used for writing summary statistics

  • snp_prefix – prefix used for writing summary statistics

  • min_maf – minimum MAF to test (default=-1 which means no minimum)

  • max_maf – maximum MAF to test (default=-1 which means no maximum)

  • write_bitset_threshold – variants with p-value less than the threshold are written to [file_root].haps.gz (default=-1, no writing)

  • calibration_factor – used in ARG-MLMA workflows, can be set to 1 to instead run linear regression (default=1)

  • concise_pvalue – return only two significant digits (default=True)

  • max_only – if True, does not write to file and only returns the max chi2 (default=False)

  • careful – if True, computes a residual sum of squares from the inferred beta as part of the standard error. If False, uses the norm of the phenotype as an approximation, like BOLT. The careful version will give slightly more significant p-values, especially for significant variants. Defaults to False.

Returns:

The maximum chi2 across the tests.

arg_needle_lib.association_diploid_mutation(arg: arg_needle_lib_pybind.ARG, phenotype: List[float], use_sample: List[bool], file_root: str, mus: List[float], random_seed: int = 0, chromosome: int = 1, snp_prefix: str = '', min_maf: float = -1, max_maf: float = -1, write_bitset_threshold: float = -1, calibration_factor: float = 1, concise_pvalue: bool = True, max_only: bool = False, careful: bool = False) List[float]

ARG diploid association testing using mutation-based sampling.

Writes summary statistics to [file_root].tab.gz and returns the max chi2 for each mutation rate passed. Additionally, can write significant variants in [file_root].haps.gz if write_bitset_threshold is passed.

One of the columns written with this option is MU_STAR. For each mutation tested, the MU_STAR value says “if you instead performed testing with a smaller mutation rate, you would sample this mutation if mu > MU_STAR”. This allows one to filter the tab.gz output for more stringent mutation rates.

Parameters:
  • arg – arg_needle_lib.ARG object with 2*n leaves

  • phenotype – a length n array of phenotypes in the same order as the ARG samples. Missing values are specified using the use_sample array and can be filled with any value in the phenotype.

  • use_sample – a length n array of booleans, False means the phenotype is missing

  • file_root – results are written to [file_root].tab.gz and [file_root].haps.gz

  • mus – an array of mutation rates to use for testing. In practice, for the [file_root].tab.gz and [file_root].haps.gz output, only the largest mutation rate of the mus array is used. However, for returning the maximum chi2 over tests, a maximum chi2 is returned for each of the mutation rates in order.

  • random_seed – seed for mutation-based sampling. When set to 0, uses the time to seed (default=0).

  • chromosome – used for writing summary statistics

  • snp_prefix – prefix used for writing summary statistics

  • min_maf – minimum MAF to test (default=-1 which means no minimum)

  • max_maf – maximum MAF to test (default=-1 which means no maximum)

  • write_bitset_threshold – variants with p-value less than the threshold are written to [file_root].haps.gz (default=-1, no writing)

  • calibration_factor – used in ARG-MLMA workflows, can be set to 1 to instead run linear regression (default=1)

  • concise_pvalue – return only two significant digits (default=True)

  • max_only – if True, does not write to file and only returns the max chi2 (default=False)

  • careful – if True, computes a residual sum of squares from the inferred beta as part of the standard error. If False, uses the norm of the phenotype as an approximation, like BOLT. The careful version will give slightly more significant p-values, especially for significant variants. Defaults to False.

Returns:

An array of maximum chi2 across the tests, one for each mutation rate in mus.

arg_needle_lib.bitset_volume_map(arg: arg_needle_lib_pybind.ARG, verbose: bool = False) Dict[str, float]

Traverse the ARG and return a map of bitset string to volume

arg_needle_lib.compute_grm(arg: arg_needle_lib_pybind.ARG, alpha: float = 0, batch_size: int = 256, diploid: bool = True, min_maf: float = 0.0, max_maf: float = 0.5) numpy.ndarray[numpy.float64[m, n]]

Compute GRM from existing mutations.

Parameters:
  • arg – arg_needle_lib.ARG object

  • alpha – if -1, the variants of branches are standardized before computing the GRM. If 0, the variants of branches are treated as-is and not standardized. Values in between interpolate between these behaviors (default=-1).

  • batch_size – batch size for GRM computations (default=True)

  • diploid – if True, leaves 2*i and 2*i+1 are treated as haploid pairs for sample i when computing the GRM (default=True)

  • min_maf – minimum MAF for mutations to include in the GRM (default=0)

  • max_maf – maximum MAF for mutations to include in the GRM (default=0.5)

Returns:

A numpy matrix with the GRM (no centering operations are performed).

arg_needle_lib.deserialize_arg(file_name: str, chunk_size: int = 1000, reserved_samples: int = -1)

Deserialize ARG from disk and create an arg_needle_lib.ARG object to return

Parameters:
  • file_name – the file name of the serialized ARG file

  • chunk_size – the chunk size for reading; small numbers will dramatically increase deserialization time

  • reserved_samples – parameter for ARG constructor allowing us to reserve additional samples for threading (default is to not reserve additional samples)

Returns:

an arg_needle_lib.ARG object

arg_needle_lib.exact_arg_grm(arg, alpha=-1, from_pos=-1, to_pos=-1, diploid=True, centering=True)

Computes the exact ARG-GRM up to Gower centering and row-column centering.

Parameters:
  • arg – arg_needle_lib.ARG object

  • alpha – if -1, the variants of branches are standardized before computing the GRM. If 0, the variants of branches are treated as-is and not standardized. Values in between interpolate between these behaviors (default=-1).

  • from_pos – start physical position for ARG-GRM computation (default=-1, use beginning of ARG)

  • to_pos – end physical position for ARG-GRM computation (default=-1, use end of ARG)

  • diploid – if True, leaves 2*i and 2*i+1 are treated as haploid pairs for sample i when computing the GRM (default=True)

  • centering – if True, applies row-column centering and Gower centering to the GRM before returning (default=True)

Returns:

A numpy matrix with the exact ARG-GRM up to Gower centering and row-column centering.

arg_needle_lib.generate_m_mutations(arg: arg_needle_lib_pybind.ARG, M: int, random_seed: int = 0) None

Generate exactly M mutations and keep on the ARG

arg_needle_lib.generate_mutations(arg: arg_needle_lib_pybind.ARG, mu: float, random_seed: int = 0, num_mutations_hint: int = 0) None

Generate mutations with a given rate and keep on the ARG

arg_needle_lib.get_genotype(arg: arg_needle_lib_pybind.ARG, mutation: arg_needle_lib_pybind.Mutation, diploid: bool = False) List[int]

Get a vector representing the genotype of the mutation

arg_needle_lib.get_mutations_matrix(arg: arg_needle_lib_pybind.ARG, from_pos: float = -inf, to_pos: float = inf, include_left: bool = True, include_right: bool = False) numpy.ndarray[numpy.int8[m, n]]

Return a mutation matrix corresponding to existing mutations on the ARG

arg_needle_lib.gower_center(mat)

Apply row-column centering to a numpy square matrix in-place.

Also returns the matrix.

Note: row column centering and Gower centering commute, i.e., they can be applied in either order.

arg_needle_lib.haploid_grm_to_diploid(hap_mat)

Turns a haploid GRM into a diploid GRM by treating neighboring IDs as haploid pairs.

arg_needle_lib.impute(arg: arg_needle_lib_pybind.ARG, position: float, genotypes: List[int], old: bool = False) List[float]

ARG-based imputation described in the ARG-Needle paper

arg_needle_lib.kc2_length_stab(arg1, arg2, num_stabs, lambdas=[1])

Compares two ARGs using stabbing queries, returning length-aware KC squared.

Parameters:
  • arg1 – the first ARG

  • arg2 – the second ARG

  • num_stabs – how many stabbing queries to perform. For i from 0 to num_stabs - 1 inclusive, the proportional positions along the genome are then sampled as i*phi (mod 1), where phi is the golden ratio. See also Supplementary Note 2 in https://doi.org/10.1038/s41588-023-01379-x

  • lambdas – array-like containing values of lambdas

Returns:

List of values containing length-aware KC squared for each lambda.

arg_needle_lib.kc2_special_stab(arg, arg_true, num_stabs, special_behavior=None, merge_fraction=0, random_kc_seed=0)

Compares two ARGs using stabbing queries, returning topology KC squared under special behavior.

Parameters:
  • arg1 – the first ARG, which can be perturbed by either merging nodes or randomly resolving polytomies.

  • arg2 – the second ARG, which can be perturbed by either merging nodes or randomly resolving polytomies.

  • num_stabs – how many stabbing queries to perform. For i from 0 to num_stabs - 1 inclusive, the proportional positions along the genome are then sampled as i*phi (mod 1), where phi is the golden ratio. See also Supplementary Note 2 in https://doi.org/10.1038/s41588-023-01379-x.

  • special_behavior – can be “random_resolve”, “heuristic_merge”, or “random_merge”. Each of these is described in Supplementary Note 2 of https://doi.org/10.1038/s41588-023-01379-x

  • merge_fraction – used by heuristic_merge and random_merge to figure out how many nodes to merge in each stabbing query tree. Default is 0.

  • random_kc_seed – used by random_resolve and random_merge to perform random behavior. The C++ RNG is seeded with this seed, and then iterates through the different stabbing query positions, consuming randomness from the RNG as necessary. Therefore different stabbing query trees will have different random behavior, because the RNG is only seeded once. Default is 0, but for random_resolve and random_merge, we check that the seed is nonzero before continuing so that the user is aware of the importance of seeding.

Returns:

The topology KC squared value, under the appropriate special behavior.

arg_needle_lib.kc2_tmrca_sv_stab(arg1, arg2, num_stabs)

Compares two ARGs using stabbing queries, returning topology KC squared, TMRCA MSE, and the split-vector (SV) metric squared and N-normalised.

The SV metric is computed as the L2 distance between the split-size vectors of arg1 and arg2. The split-size vector enumerates num_leaves(mrca(i, j)) for all leaf pairs {i, j}. This metric is due to Smith, https://doi.org/10.1093/sysbio/syab100.

Parameters:
  • arg1 – the first ARG

  • arg2 – the second ARG

  • num_stabs – how many stabbing queries to perform. For i from 0 to num_stabs - 1 inclusive, the proportional positions along the genome are then sampled as i*phi (mod 1), where phi is the golden ratio. See also Supplementary Note 2 in https://doi.org/10.1038/s41588-023-01379-x

Returns:

Dictionary with keys “kc2”, “tmrca_mse”, “sv2” and float values.

arg_needle_lib.local_volume(arg: arg_needle_lib_pybind.ARG, min_position: float, max_position: float) float

Get the local arg volume

arg_needle_lib.monte_carlo_arg_grm(arg, monte_carlo_mu, seed=1234, alpha=-1, diploid=True, centering=True, min_maf=0.0, max_maf=0.5, batch_size=256)

Computes the Monte Carlo ARG-GRM.

This version wraps the C++ Eigen implementation and is more memory-efficient than the numpy version monte_carlo_arg_grm_numpy.

Parameters:
  • arg – arg_needle_lib.ARG object

  • monte_carlo_mu – mutation rate used for sampling mutations to compute the ARG-GRM

  • seed – random seed used for sampling (default=1234)

  • alpha – if -1, all variants are standardized before computing the GRM. If 0, variants are treated as-is and not standardized. Values in between interpolate between these behaviors (default=-1).

  • diploid – if True, leaves 2*i and 2*i+1 are treated as haploid pairs for sample i when computing the GRM (default=True)

  • centering – if True, applies row-column centering and Gower centering to the GRM before returning (default=True)

  • min_maf – exclusive minimum MAF (default=0.)

  • max_maf – inclusive maximum MAF (default=0.5)

  • batch_size – batch size when computing the ARG-GRM (default=256)

Returns:

A numpy matrix with the estimated Monte Carlo ARG-GRM.

arg_needle_lib.mutation_best(arg: arg_needle_lib_pybind.ARG, position: float, genotypes: List[int], random_seed: int = 0) int

Hamming distance for best mutation placement on ARG at a position

arg_needle_lib.mutation_match(arg: arg_needle_lib_pybind.ARG, position: float, genotypes: List[int]) bool

Boolean for whether genotypes can arise from a single mutation at position

arg_needle_lib.mutation_match_all(arg, ts_with_mutations, resolve_reps=0)

Tries to best match a given set of mutations on to an ARG, returning results.

Parameters:
  • argarg_needle_lib.ARG object (usually one that has been inferred)

  • ts_with_mutationstskit.TreeSequence object containing mutations, that we want to check for on arg

  • resolve_reps (int) – how many times to randomly resolve polytomies in the arg object when matching. This is unnecessary if arg contains no polytomies, but otherwise allows for matching against resolved binary trees, averaging over several random resolutions. Default 0, which corresponds to no resolving of polytomies.

Returns:

Two lists (errors, diffs), each of length equal to the number of mutations in

ts_with_mutations. For each mutation, errors gives 0 if a mutation can be correctly placed on the ARG to give that genotype at that position, 1 if not, and a value in between if random resolution of polytomies sometimes gives a match and sometimes does not. diffs gives the Hamming distance between the best possible placed mutation and the genotype from ts_with_mutations, with the mean Hamming distance in the case of random resolution of polytomies.

arg_needle_lib.num_lineages(arg: arg_needle_lib_pybind.ARG, position: float, height: float) int

Count the number of lineages at a given position and height

arg_needle_lib.rf_resolve_stab(arg1, arg2, num_stabs, resolve_seeds, min_mac=0, max_mac=0)

Computes scaled Robinson-Foulds using stabbing queries, with polytomies randomly resolved.

Parameters:
  • arg1 – the first ARG

  • arg2 – the second ARG

  • num_stabs – how many stabbing queries to perform. For i from 0 to num_stabs - 1 inclusive, the proportional positions along the genome are then sampled as i*phi (mod 1), where phi is the golden ratio. See also Supplementary Note 2 in https://doi.org/10.1038/s41588-023-01379-x.

  • resolve_seeds – a list of seeds to use for randomly resolving polytomies.

  • min_mac – min MAC (inclusive)

  • max_mac – max MAC (exclusive, value of 0 means set to maximum)

Returns:

Dictionary with key “scaled_robinson_foulds” and a float value, as well as the less

used keys “branch_recall” and “branch_precision”.

arg_needle_lib.rf_total_variation_stab(arg1, arg2, num_stabs, min_mac=0, max_mac=0, may_contain_polytomies=False)

Computes scaled Robinson-Foulds and total variation using stabbing queries.

In this version, polytomies are not resolved, which may bias the metric. See also rf_resolve_stab.

Parameters:
  • arg1 – the first ARG

  • arg2 – the second ARG

  • num_stabs – how many stabbing queries to perform. For i from 0 to num_stabs - 1 inclusive, the proportional positions along the genome are then sampled as i*phi (mod 1), where phi is the golden ratio. See also Supplementary Note 2 in https://doi.org/10.1038/s41588-023-01379-x.

  • min_mac – min MAC (inclusive)

  • max_mac – max MAC (exclusive, value of 0 means set to maximum)

  • may_contain_polytomies – if False, and the two ARGs do not have polytomies and there are no MAC filters, then a simple check is done verifying that calculations are done correctly. If there may be polytomies, then set to True. See also rf_resolve_stab which will randomly resolve the polytomies. Default = False.

Returns:

Dictionary with keys “scaled_robinson_foulds” and “total_variation” and float values, as

well as the less used keys “branch_recall”, “branch_precision”, “mutation_recall”, and “mutation_precision”.

arg_needle_lib.row_column_center(mat)

Apply row-column centering to a numpy square matrix in-place.

Also returns the matrix.

Note: row column centering and Gower centering commute, i.e., they can be applied in either order.

arg_needle_lib.serialize_arg(arg_to_serialize: ARG, file_name: str = 'arg.arg', chunk_size: int = 1000, compression='gzip', compression_opts=None, node_bounds=True, mutations=None, sites=None)

Serialize ARG to disk using HDF5.

By default, gzip compression with level 9 will be used, but available options are listed below. Compression can be turned off by passing compression=None.

Parameters:
  • arg_to_serialize – arg_needle_lib.ARG object to serialize

  • file_name – the file name to serialize the ARG to

  • chunk_size – the chunk size for writing; small numbers will dramatically increase serialization time

  • compression – whether to use compression; can save up to ~50% on disk at a runtime cost Refer to h5py documentation for details gzip provides good compression with compression_opts=9, with ~+50% runtime for serialization lzf provides poor compression, with little runtime overhead szip provides good compression, with little runtime overhead, but is not available on all systems

  • compression_opts – the level of compression requested for gzip: 9 for highest compression. Should be None if compression != ‘gzip’

  • node_bounds – whether to serialize node start/end positions. Default: True

  • mutations – whether to serialize mutations. Default: True if num_mutations > 0, otherwise False

  • sites – whether to serialize sites. Default: True if sites present, otherwise False

arg_needle_lib.stab_return_all_bitsets(arg: arg_needle_lib_pybind.ARG, position: float) List[Tuple[int, float, List[bool]]]

All bitsets at a position of the ARG, each tuple with values allele count / length / bitset

arg_needle_lib.total_volume(arg: arg_needle_lib_pybind.ARG) float

Get ARG volume

arg_needle_lib.trim_arg(arg: arg_needle_lib_pybind.ARG, trim_start: float, trim_end: float) arg_needle_lib_pybind.ARG

Trim ARG down to the specified interval. Start and end positions do not include the offset.

arg_needle_lib.tskit_to_arg(ts, reserved_samples=-1)

Convert tskit.TreeSequence to arg_needle_lib.ARG using a custom constructor

Parameters:
  • ts – tskit.TreeSequence object

  • reserved_samples (int) – Parameter from ARG constructor allowing us to reserve additional samples for threading (default is to not reserve additional samples).

Returns:

An arg_needle_lib.ARG object.

arg_needle_lib.validate_serialized_arg(file_name: str)

Validate a serialized ARG on disk. This method returns True if the file is valid, and False otherwise. A message indicating the fault will be printed if the serialized ARG file is invalid.

Parameters:

file_name – the file name of the serialized ARG file

Returns:

whether the serialized ARG file is valid

arg_needle_lib.write_bitsets(arg: arg_needle_lib_pybind.ARG, file_root: str = '', diploid: bool = False, chromosome: int = 1, snp_prefix: str = '', min_mac: int = 0, max_mac: int = 0, write_dosage: bool = False, use_gz: bool = False, count_only: bool = False) int

Write out ARG branches as bitsets

arg_needle_lib.write_grm(grm, num_sites, path_prefix, write_binary=False)

Writes a GRM to file in a format that can be used by GCTA.

Parameters:
  • grm – numpy matrix containing the GRM

  • num_sites – number of sites used to compute the GRM, expected but typically unused by downstream applications

  • path_prefix – string, results will be written to path_prefix.grm.*

  • write_binary – if True, write a binary file format that is observed to yield faster write times and can be read by GCTA. If False, write using gzip (default=False).

arg_needle_lib.write_mutations_to_haps(arg: arg_needle_lib_pybind.ARG, file_root: str, min_maf: float = 0, max_maf: float = 1, min_time: float = 0, max_time: float = inf) None

Write mutations to disk in haps/samples format