#!/usr/bin/env python3

#
# h5m2csv.py: Convert HDF5 SNP matrix to CSV
#
# (c) 2021 by Joffrey Fitz (joffrey.fitz@tuebingen.mpg.de),
# Max Planck Institute for Developmental Biology,
# Tuebingen, Germany
#

import h5py, numpy

f = h5py.File('1001_SNP_MATRIX/imputed_snps_binary.hdf5','r')

# Get all SNP positions for all chromosomes (len=10709949)
positions = f['positions'][:]

# Array of tupels with start/stop indices for each chromosome
chr_regions = f['positions'].attrs['chr_regions']

# Array of SNP positions for all chromosomes, each chromosome is a hash
# with "Chr<N>" as key, and a numpy.array of positions as value. 
snp_pos_on_chrs = [
	{ "label": "Chr1", "chr_idx": 0, "positions": positions[chr_regions[0][0]:chr_regions[0][1]] },
	{ "label": "Chr2", "chr_idx": 1, "positions": positions[chr_regions[1][0]:chr_regions[1][1]] },
	{ "label": "Chr3", "chr_idx": 2, "positions": positions[chr_regions[2][0]:chr_regions[2][1]] },
	{ "label": "Chr4", "chr_idx": 3, "positions": positions[chr_regions[3][0]:chr_regions[3][1]] },
	{ "label": "Chr5", "chr_idx": 4, "positions": positions[chr_regions[4][0]:chr_regions[4][1]] }
]

# Print header
print("#Chromosome", "Positon", "Count_zeros", "Count_ones"
	, ",".join(f['accessions'][:].astype(str)), sep="," )

# Loop over all chromosomes
for chr in snp_pos_on_chrs:
	
	# Loop over all positions
	for pos in numpy.nditer(chr["positions"]):
		# Find index of a specific position
		ix = numpy.where(chr["positions"] == pos)[0][0]

		# Add chromosome start position to SNP position
		ix = ix + chr_regions[chr["chr_idx"]][0]
	
		# Get the corresponding SNPs for that position
		snps = f['snps'][ix]

		# Count 0s in snps
		cnt_zeros = numpy.count_nonzero(snps==0)

		# Count 1s in snp
		cnt_ones = numpy.count_nonzero(snps==1)

		print(chr["label"], pos, cnt_zeros, cnt_ones
			, ",".join(snps.astype(str)), sep=",")
	

