Hello,
I am a student analyzing microbiomes using QIIME in a Linux environment. The initial steps, from demultiplexing to clustering, proceed without any issues. However, during the taxonomy filtering step, the resulting tax_filtered.tsv
and tax_filtered_modified.tsv
files are identical. The code used in this process is as follows:
cd 6_taxonomy_assigned/
python taxonomy_cleaning.py taxonomy.tsv filtered_string unclassified_string
cd ..
Additionally, the taxonomy_cleaning.py script is as follows:
#!/usr/bin/env python
"""Filter and normalise a tab-separated taxonomy table.

Usage:
    python taxonomy_cleaning.py taxonomy.tsv filtered_string unclassified_string

Arguments:
    taxonomy.tsv         Tab-separated table; column 0 is the feature ID and
                         column 1 the ';'-separated lineage.
    filtered_string      One pattern per line; a row whose lineage contains
                         any of them (case-insensitive) is removed.
    unclassified_string  One pattern per line; a rank containing any of them
                         (case-insensitive) is replaced by 'Unclassified'.

Files written to the current directory:
    tax_filtered.tsv           rows that survived filtering
    filtered_id                '#SampleID' followed by the removed feature IDs
    tax_filtered_modified.tsv  surviving rows with lineages padded to
                               RANK_COUNT ranks and matching ranks masked

NOTE: tax_filtered.tsv and tax_filtered_modified.tsv are byte-identical
whenever every surviving lineage already has RANK_COUNT ranks and no rank
contains a pattern from unclassified_string.

The code deliberately avoids Python-3-only syntax so that it keeps running
under whichever interpreter the surrounding pipeline provides.
"""
import sys

RANK_COUNT = 7
PLACEHOLDER = 'Unclassified'
USAGE = ('usage: python taxonomy_cleaning.py '
         'taxonomy.tsv filtered_string unclassified_string\n')


def parse_rows(lines):
    """Split an iterable of table lines into lists of columns.

    Line terminators ('\\n' and '\\r\\n') are removed and blank lines are
    skipped, so a trailing empty line no longer aborts the run.

    Raises:
        ValueError: if a non-blank line has fewer than two columns.
    """
    rows = []
    for number, line in enumerate(lines, 1):
        text = line.rstrip('\r\n')
        if not text.strip():
            continue
        row = text.split('\t')
        if len(row) < 2:
            raise ValueError(
                'line {0}: expected at least 2 tab-separated columns, '
                'got {1!r}'.format(number, text))
        rows.append(row)
    return rows


def parse_patterns(lines):
    """Return the lower-cased, non-blank patterns from an iterable of lines.

    Only the line terminator is removed, so inner spaces ('Incertae Sedis')
    are preserved. Blank lines are dropped because an empty pattern is a
    substring of every lineage and would match everything. A stray carriage
    return is removed because it would stop the pattern from ever matching.
    """
    patterns = []
    for line in lines:
        text = line.rstrip('\r\n')
        if text.strip():
            patterns.append(text.lower())
    return patterns


def drop_matching(rows, patterns):
    """Partition rows by whether the lineage contains any pattern.

    Patterns must already be lower-case. Returns ``(kept_rows, removed_ids)``
    with both lists in input order.
    """
    kept = []
    removed_ids = []
    for row in rows:
        lineage = row[1].lower()
        if any(pattern in lineage for pattern in patterns):
            removed_ids.append(row[0])
        else:
            kept.append(row)
    return kept, removed_ids


def drop_unassigned(rows):
    """Remove rows whose lineage contains 'unassigned' (case-insensitive)."""
    return drop_matching(rows, ['unassigned'])


def mask_lineage(lineage, patterns, rank_count=RANK_COUNT):
    """Pad a lineage to ``rank_count`` ranks and mask matching ranks.

    Missing ranks are appended as 'Unclassified'. Each of the first
    ``rank_count`` ranks that contains any (lower-case) pattern is replaced
    by 'Unclassified'; ranks beyond ``rank_count`` are left untouched.
    """
    ranks = lineage.split(';')
    # A negative difference multiplies to an empty list, i.e. no padding.
    ranks.extend([PLACEHOLDER] * (rank_count - len(ranks)))
    for index in range(min(rank_count, len(ranks))):
        rank = ranks[index].lower()
        if any(pattern in rank for pattern in patterns):
            ranks[index] = PLACEHOLDER
    return ';'.join(ranks)


def mask_table(rows, patterns, rank_count=RANK_COUNT):
    """Return a copy of ``rows`` with every lineage but the first normalised.

    The first row is copied through unchanged (it is treated as the header,
    as the original script did). The input rows are not mutated.
    """
    if not rows:
        return []
    masked = [list(rows[0])]
    for row in rows[1:]:
        updated = list(row)
        updated[1] = mask_lineage(row[1], patterns, rank_count)
        masked.append(updated)
    return masked


def format_table(rows):
    """Serialise rows as tab-separated lines terminated by '\\n'."""
    return '\n'.join('\t'.join(row) for row in rows) + '\n'


def write_text(path, text):
    """Write ``text`` to ``path``, closing the file even on error."""
    with open(path, 'w') as handle:
        handle.write(text)


def main(argv=None):
    """Run the cleaning pipeline; returns a process exit status."""
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.stderr.write(USAGE)
        return 2
    taxonomy_path, filter_path, mask_path = args[:3]

    with open(taxonomy_path) as handle:
        rows = parse_rows(handle)
    with open(filter_path) as handle:
        filter_patterns = parse_patterns(handle)
    with open(mask_path) as handle:
        mask_patterns = parse_patterns(handle)

    # Two passes keep the ID order of the original script:
    # all unassigned IDs first, then the pattern-filtered ones.
    assigned, unassigned_ids = drop_unassigned(rows)
    kept, matched_ids = drop_matching(assigned, filter_patterns)

    write_text('tax_filtered.tsv', format_table(kept))
    removed_ids = ['#SampleID'] + unassigned_ids + matched_ids
    write_text('filtered_id', '\n'.join(removed_ids) + '\n')
    write_text('tax_filtered_modified.tsv',
               format_table(mask_table(kept, mask_patterns)))
    return 0


if __name__ == '__main__':
    sys.exit(main())
unclassified_string is
unidentified
unclassified
Incertae Sedis
Unknown
filtered_string is
D_0__Archaea
Chloroplast
Mitochondria
Cyanobacteria
Rickettsia
Eukaryota
Archaea
Please help me.