CSV to PLINK
: csv to (map, ped)
- CSV format : columns=[CHR, id, pheno, variant1, variant2,...]
- CHR : chromosome
- pheno : case 1 / control 0
#!/usr/bin/env python
import sys
# Genotype code found in the input table -> allele pair written to the .ped file.
pedDict = {
    "0": "0 0",
    "1": "A A",
    "2": "A a",
    "3": "a a",
}


def convertToPlink(infile_name):
    """Convert a whitespace-delimited genotype table into PLINK .map/.ped files.

    The first line of ``infile_name`` is the header
    (``CHR id pheno variant1 variant2 ...``); every following non-blank line
    describes one sample.  The chromosome label is read from the first sample
    row and is used both in the .map rows and in the output file names
    ``test<chromosome>.map`` and ``test<chromosome>.ped``, which are created
    in the current working directory.

    The phenotype column (control 0 / case 1) is written as 1 / 2, and each
    genotype code is translated through ``pedDict``.

    Args:
        infile_name: Path of the input table.

    Raises:
        ValueError: If the file contains no sample rows after the header.
        KeyError: If a genotype code is not a key of ``pedDict``.
    """
    with open(infile_name, 'r') as infile:
        header = infile.readline().split()

        # The header has just been consumed, so it is skipped by position
        # rather than by a "CHR" prefix test (which would also drop samples
        # whose chromosome label starts with "CHR").  Blank lines are ignored.
        rows = (fields for fields in (line.split() for line in infile) if fields)
        row = next(rows, None)
        if row is None:
            raise ValueError("no sample rows found in {}".format(infile_name))
        chromosome = row[0]

        # One .map row per variant column: chromosome, variant name, "0",
        # and the 1-based order of the variant in the header.
        with open('test' + chromosome + '.map', 'w') as mapfile:
            for order, variant in enumerate(header[3:], start=1):
                mapfile.write("\t".join([chromosome, variant, "0", str(order)]) + "\n")

        # One .ped row per sample, streamed so the table is read only once.
        with open('test' + chromosome + '.ped', 'w') as pedfile:
            id_index = 0
            while row is not None:
                id_index += 1
                family_id = "i_" + str(id_index)
                pheno = str(int(row[2]) + 1)
                alleles = [pedDict[genotype] for genotype in row[3:]]
                pedfile.write(" ".join([family_id, row[1], "0", "0", "0", pheno] + alleles) + "\n")
                row = next(rows, None)


if __name__ == "__main__":
    infile_name = sys.argv[1]
    convertToPlink(infile_name)
Include characters or words
# Keep the column names that contain at least one of the search terms.
# The terms are joined with '|' so that str.contains matches any of them.
df = pd.read_csv('mydata.csv')
col = df.columns
searchfor = ['hel', 'good']
pattern = '|'.join(searchfor)
is_match = col.str.contains(pattern)
s1 = col[is_match]
print(s1.values)
['hello', 'hellgate', 'good_morning', 'good_afternoon']
Split by separator
# Demonstrate Series.str.split with and without expand=True.
a = [
    ['asd/123', 34],
    ['qwe/234', 45],
    ['zxc/456', 67],
    ['fgh/678', 89],
]
df = pd.DataFrame(a)
print(df)
print()

# Without expand: a Series whose values are lists of the pieces.
as_lists = df[0].str.split('/')
print(as_lists)
print(as_lists.shape)

# With expand=True: a DataFrame with one column per piece.
as_columns = df[0].str.split('/', expand=True)
print(as_columns)
print(as_columns.shape)
0 1
0 asd/123 34
1 qwe/234 45
2 zxc/456 67
3 fgh/678 89
0 [asd, 123]
1 [qwe, 234]
2 [zxc, 456]
3 [fgh, 678]
Name: 0, dtype: object
(4,)
0 1
0 asd 123
1 qwe 234
2 zxc 456
3 fgh 678
(4, 2)
Merge two dataframes
# Left merge on the row index of both frames: the result is keyed by the
# index of df_left, with the columns of df_right attached where it matches.
df_final = pd.merge(
    left=df_left,
    right=df_right,
    how='left',
    left_index=True,
    right_index=True,
)
Contains value
import pandas as pd
# Count the values of column 1 that contain the literal substring ".1".
# The separator is a regular expression, so it is written as a raw string
# (a plain '\s' is an invalid escape sequence in a normal string literal) and
# the python parser engine is requested explicitly, because the default C
# engine does not support regex separators and would fall back with a warning.
df = pd.read_csv('mydata.csv', sep=r'\t+|\s+', header=None, engine='python')
col = df[1]
# regex=False: the '.' is matched literally, not as "any character".
s1 = col[col.str.contains('.1', regex=False)]
print(len(s1))
String to number in DF
def _to_numeric_or_keep(column):
    """Return ``column`` converted by ``pd.to_numeric``, or unchanged if that fails."""
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# errors='ignore' is deprecated in pandas 2.2; catching the conversion error
# explicitly keeps the same result: numeric-looking columns are converted and
# every other column is left as it was.
df = df.apply(_to_numeric_or_keep)
Merge dataframes
# Outer merge on the row index of both frames: index values that occur in
# either df or df1 are kept in the result.
df2 = pd.merge(
    left=df,
    right=df1,
    how='outer',
    left_index=True,
    right_index=True,
)
Display all
# Print the whole frame: the row and column display limits are lifted only
# inside this with-block and restored afterwards.
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    print(df)
Separate ID
import pandas as pd
# Split the entries of the file 'list' into batches of at most 100 and write
# one single-column file per batch, named In_File_<first row index>.txt.
# Each line of 'list' is split on '/'; column 5 is used as the file name and
# df0 is the part of that name before its first '.'.
df = pd.read_csv('list', sep='/', header=None)
df0 = df[5].str.split('.', expand=True)[0]
step = 100

names = df[5].tolist()
ids = df0.tolist()
for ini in range(0, len(names), step):
    # The last batch may be shorter than `step`.
    fin = min(ini + step, len(names))
    # File layout: fixed first entry, batch size, the batch's file names,
    # then the batch's IDs.  The rows are built by slicing because
    # DataFrame.append no longer exists in pandas >= 2.0, and slicing also
    # avoids rescanning the full list for every batch.
    rows = ['hs38DH_re.txt', fin - ini] + names[ini:fin] + ids[ini:fin]
    df1 = pd.DataFrame(rows)
    print(df1)
    df1.to_csv('In_File_' + str(ini) + '.txt', index=False, header=False)
Select data
import sys
import time
def help():
    """Print the command-line usage message and terminate the program.

    Raises:
        SystemExit: Always, with no exit code.
    """
    print("Usage: python {} [ chrn ] \n".format(sys.argv[0]))
    # sys.exit() rather than the bare exit(): the latter is an interactive
    # helper installed by the site module and is not meant for programs.
    sys.exit()
# decorator
def runTime(func):
    """Decorator that prints the wall-clock duration of every call to ``func``.

    The wrapped function's return value is passed through unchanged.  Nothing
    is printed when ``func`` raises, because the exception propagates before
    the report line is reached.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same call signature that carries ``func``'s name
        and docstring.
    """
    import functools  # local import: only this decorator needs it

    @functools.wraps(func)  # keep func.__name__ / __doc__ on the wrapper
    def wrapper(*args, **kwargs):
        start = time.time()
        result = func(*args, **kwargs)
        end = time.time()
        print('# module : {0} / time(sec) : {1}'.format(func.__name__, end - start))
        return result
    return wrapper
@runTime
def readFiles():
    """Copy to ``out`` the lines of ``inf`` selected by the ranges in ``inbed``.

    Reads the module-level names ``inf`` (data file), ``out`` (output file)
    and ``inbed`` (selection file).  For every line of ``inbed`` whose second
    whitespace-separated field differs from that of the previously used line,
    the data file is read forward until a line with the same second field is
    found, and that line is written to the output.  The data file is only
    read forward, never rewound.

    NOTE(review): the indentation of the original was lost; this assumes that
    only the matching line is written (not every line read while searching).
    Please confirm against the original notebook.
    """
    # Context managers guarantee that all three files are closed (and the
    # output flushed) even if an error occurs part-way through.
    with open(inf, 'r') as inFile, \
            open(out, 'w') as outFile, \
            open(inbed, 'r') as bedFile:
        # None instead of a fake '1 0' line, so a real key "0" is not
        # mistaken for the "nothing seen yet" marker.
        last_key = None
        for bed_line in bedFile:
            bed_fields = bed_line.split()
            if not bed_fields:
                continue  # blank line in the selection file
            key = bed_fields[1]
            if key == last_key:
                continue  # same key as the previous selection line
            last_key = key

            for data_line in inFile:
                data_fields = data_line.split()
                if len(data_fields) > 1 and data_fields[1] == key:
                    outFile.write(data_line)
                    break
            else:
                # Data file exhausted without a match: nothing further can be
                # selected, so stop instead of failing on an empty line.
                return
if __name__ == "__main__":
    # Exactly one command-line argument is expected; otherwise show the
    # usage text (help() terminates the program).
    if len(sys.argv) != 2:
        help()
    chrn = sys.argv[1]

    # Data file, output file and selection-range file for this run; they are
    # read as module-level names by readFiles().
    inf, out, inbed = (
        template.format(chrn)
        for template in (
            "Out_chr{}.txt",
            "Out_chr{}.sel",
            "selected_range_chr{}.sel",
        )
    )

    # Not referenced anywhere in the visible part of this file.
    fadata = [0]
    bedData = []

    readFiles()
    sys.exit(0)
ADD
def plink_cmd(outputFile):
    """Insert a gene column into ``<outputFile>.txt`` and write ``<outputFile>.reg``.

    ``outputFile`` is a path prefix.  The annotation table
    ``<outputFile>.ann`` is loaded once and handed to ``find_gene`` for every
    value of the selected column.  All file work is done through shell
    commands, so ``head``, ``awk``, ``paste`` and ``/bin/bash`` must exist.

    NOTE(review): the prefix and extensions are pasted into shell command
    strings unquoted; confirm they never contain spaces or shell
    metacharacters.
    """
    # Not used in the visible part of this function.
    inputFile=outputFile+'_ibd'
    df=pd.read_table(outputFile+'.ann',header=None)
    def add_gene(header,col,tempExt,outputExt):
        """Insert a new column titled ``header`` after column ``col``.

        Reads ``<outputFile>.<tempExt>`` and writes ``<outputFile>.<outputExt>``;
        ``<outputFile>.tmp`` is used as scratch space for the new column.
        """
        # Number of whitespace-separated fields in the first line of the input.
        cmd='head -n 1 '+outputFile+'.'+tempExt
        lh = len(subprocess.check_output(cmd,shell=True,encoding='utf-8').split())
        # Start the scratch file with the new column title, then capture
        # column `col` of every input line from awk's stdout.
        cmd='echo '+header+' > '+outputFile+'.tmp && awk \'{print $'+str(col)+'}\' '+outputFile+'.'+tempExt
        # [1:-1] drops the value from the first line (presumably the header
        # row - confirm) and the empty string after the final newline.
        snps=subprocess.check_output(cmd,shell=True,encoding='utf-8').split('\n')[1:-1]
        # presumably calls find_gene(snp, df) for each value across 40 worker
        # processes with a progress bar - verify against parmap / find_gene.
        genes=parmap.map(find_gene,snps,df,pm_pbar=True,pm_processes=40)
        # Append one result per line below the title in the scratch file.
        with open(outputFile+'.tmp','a') as f:
            for i in genes:
                f.write(i+'\n')
        # awk print expressions: `cols` is $1 .. $col and `endCols` is
        # $(col+1) .. $lh, each joined with a literal "\t" for awk.
        cols='\"\\t\"'.join(map(lambda x:'$'+str(x),range(1,col+1)))
        endCols='\"\\t\"'.join(map(lambda x:'$'+str(x),range(col+1,lh+1)))
        if endCols=='':
            # `col` is the last column: paste leading columns + new column.
            cmd='awk \'{print '+cols+'}\' '+outputFile+'.'+tempExt+'|paste - '+outputFile+'.tmp >'+outputFile+'.'+outputExt
        else:
            # Leading columns + new column + trailing columns; the trailing
            # part uses the <( ... ) process substitution, a bash feature.
            cmd='awk \'{print '+cols+'}\' '+outputFile+'.'+tempExt+'|paste - '+outputFile+'.tmp <(awk \'{print '+endCols+'}\' '+outputFile+'.'+tempExt+') >'+outputFile+'.'+outputExt
        # bash is requested explicitly because of the process substitution above.
        subprocess.run(cmd,shell=True, executable="/bin/bash", stdout=subprocess.DEVNULL)
    add_gene('GENE',3,'txt','reg')