Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
ahoek
wasp_sven_test
Commits
6681c791
Commit
6681c791
authored
Nov 11, 2021
by
ahoek
Browse files
Add neccessary files for WASP Snakemake workflow
parent
d67f962e
Changes
14
Expand all
Hide whitespace changes
Inline
Side-by-side
Samples/.gitkeep
0 → 100644
View file @
6681c791
Scripts/demultiMetrics.py
0 → 100755
View file @
6681c791
#!/usr/bin/env python3
import
sys
import
json
import
os
import
pysam
# Reading files
cellcounter
=
sys
.
argv
[
1
]
mapping
=
sys
.
argv
[
2
]
validBc
=
pysam
.
AlignmentFile
(
sys
.
argv
[
3
],
'rb'
)
sampleName
=
sys
.
argv
[
4
]
# Opening file with number of counted cell barcodes
with
open
(
cellcounter
,
'r'
)
as
cFile
:
# Getting the number of counted cell barcodes
cellbarcodes
=
0
cellbarcodes
=
cFile
.
readline
().
strip
().
split
(
"
\t
"
)
#print(cellbarcodes[1])
# Opening STAR log file
with
open
(
mapping
,
'r'
)
as
mFile
:
rawReads
=
0
# Reading number of raw reads
for
count
,
line
in
enumerate
(
mFile
):
if
count
==
5
:
rawReads
=
line
.
strip
().
split
(
"
\t
"
)
#print(rawReads)
# Opening file with all reads with valid barcodes
# Reading every line
bcReads
=
0
for
line
in
validBc
.
fetch
(
until_eof
=
True
):
read
=
str
(
line
).
split
(
"
\t
"
)
#print(read[1])
# Check if read is not a multimapping
if
int
(
read
[
1
])
!=
256
and
int
(
read
[
1
])
!=
272
:
bcReads
=
bcReads
+
1
#print(bcReads)
# Saving all stats in a dictionary
sampleCells
=
{
"Sample"
:
sampleName
,
"Cell_barcodes"
:
int
(
cellbarcodes
[
1
]),
"Raw_reads"
:
int
(
rawReads
[
1
]),
"Reads_with_valid_barcode"
:
int
(
bcReads
)}
#print(sampleCells)
# Creating a name for JSON file
jsonFile
=
os
.
path
.
join
(
"Results/"
+
sampleName
+
"/JSON_Files/"
+
sampleName
+
"_Cell_Numbers.json"
)
# Writing the dictionary in a JSON file
with
open
(
jsonFile
,
'w'
)
as
jFile
:
json
.
dump
(
sampleCells
,
jFile
)
Scripts/demulti_umi.py
0 → 100755
View file @
6681c791
#!/usr/bin/env python3
# Import of modules
import
pysam
import
sys
import
Levenshtein
import
py
# Reading-in BAM file of featureCounts analysis
bamFiles
=
pysam
.
AlignmentFile
(
sys
.
argv
[
1
],
"rb"
)
fileForDetection
=
pysam
.
AlignmentFile
(
sys
.
argv
[
1
],
"rb"
)
headerFile
=
pysam
.
AlignmentFile
(
sys
.
argv
[
2
],
"wb"
,
template
=
bamFiles
)
bclist10xv2
=
set
(
line
.
strip
()
for
line
in
open
(
'Scripts/whitelist_10X_737K-august-2016.txt'
))
bclist10xv3
=
set
(
line
.
strip
()
for
line
in
open
(
'Scripts/whitelist_10X_3M-february-2018.txt'
))
# Barcode whitelist
barcodeList
=
[
"AACAGC"
,
"ATACTT"
,
"CCTCTA"
,
"AACGTG"
,
"ATAGCG"
,
"CGAAAG"
,
"GAGGCC"
,
"GTACAG"
,
"TCTAGC"
,
"AAGCCA"
,
"ATATAC"
,
"CGAGCA"
,
"GAGTGA"
,
"AAAGAA"
,
"AGTCTG"
,
"GAGCTT"
,
"AAGTAT"
,
"ATCCGG"
,
"CGCATA"
,
"GATCAA"
,
"AATTGG"
,
"ATGAAG"
,
"ACAAGG"
,
"ATTAGT"
,
"CCGTAA"
,
"GACTCG"
,
"GGTAGG"
,
"TCGCCT"
,
"GGTGCT"
,
"TCGGGA"
,
"GTCCTA"
,
"TGAATT"
,
"GTCGGC"
,
"TGAGAC"
,
"CGGCGT"
,
"GCCAGA"
,
"GTGGTG"
,
"TGCGGT"
,
"CGGTCC"
,
"GCCGTT"
,
"GTTAAC"
,
"TGCTAA"
,
"GTTTCA"
,
"TGGCAG"
,
"ACCCAA"
,
"CAACCG"
,
"CGTTAT"
,
"GCGAAT"
,
"ACCTTC"
,
"GCGCGG"
,
"TAAGCT"
,
"TGTGTA"
,
"ACGGAC"
,
"CACCAC"
,
"CTATTA"
,
"GCTCCC"
,
"TAATAG"
,
"TGTTCG"
,
"ACTGCA"
,
"CACTGT"
,
"CTCAAT"
,
"GCTGAG"
,
"TACCGA"
,
"TTAAGA"
,
"AGACCC"
,
"CAGACT"
,
"CTGTGG"
,
"GCTTGT"
,
"AGATGT"
,
"CAAGTC"
,
"CTAGGT"
,
"CAGGAG"
,
"CTTACG"
,
"AGCACG"
,
"CATAGA"
,
"CTTGAA"
,
"TAGAGG"
,
"TTCGCA"
,
"GGACGA"
,
"TATTTC"
,
"TTCTTG"
,
"GGATTG"
,
"TCAGTG"
,
"TTGCTC"
,
"GGCCAT"
,
"TCATCA"
,
"TTGGAT"
,
"AGGTTA"
,
"CCACGC"
,
"GAAATA"
,
"AGTAAA"
,
"CCGATG"
,
"GAAGGG"
,
"GGGATC"
,
"TCCAAG"
,
"TTTGGG"
]
# Function to compare two sequences, allowing 1 ED
linker1
=
'TAGCCATCGCATTGC'
linker2
=
'TACCTCTGAGCTGAA'
def
linkerPos
(
read
,
linker
):
min
=
1
pos
=
0
hit
=
False
for
i
in
range
(
len
(
read
)
-
len
(
linker
)
+
1
):
substr
=
read
[
i
:
i
+
len
(
linker
)]
hamDist
=
Levenshtein
.
hamming
(
substr
,
linker
)
if
hamDist
<=
min
:
min
=
hamDist
pos
=
i
+
1
hit
=
True
return
(
hit
,
pos
)
# Function to compare barcodes of read with barcodes of whitelist
def
validBarcodes
(
rbc
,
bcList
):
newBarcode
=
""
for
bcL
in
bcList
:
if
Levenshtein
.
hamming
(
rbc
,
bcL
)
<=
1
:
newBarcode
=
bcL
return
newBarcode
def
checkBarcode10xv2
():
for
read
in
bamFiles
.
fetch
(
until_eof
=
True
):
# Splitting barcode of read
readSeq
=
read
.
qname
.
split
(
":"
)
rawBarcode
=
readSeq
[
7
]
rawBarcode
=
rawBarcode
.
strip
()
readBC
=
rawBarcode
[
0
:
16
]
#if not rawBarcode.startswith("*"):
if
readBC
in
bclist10xv2
:
validBc
=
readBC
readUmi
=
rawBarcode
[
16
:
26
]
if
"Assigned"
in
str
(
read
):
bcRead
=
str
(
read
)
spRead
=
bcRead
.
split
(
"
\t
"
)
a
=
pysam
.
AlignedSegment
()
a
.
query_name
=
str
(
readSeq
[
0
]
+
":"
+
readSeq
[
1
]
+
":"
+
readSeq
[
2
]
+
":"
+
readSeq
[
3
]
+
":"
+
readSeq
[
4
]
+
":"
+
readSeq
[
5
]
+
":"
+
readSeq
[
6
]
+
"_"
+
validBc
+
"_"
+
readUmi
)
a
.
mapping_quality
=
int
(
spRead
[
4
])
a
.
query_sequence
=
spRead
[
9
]
a
.
flag
=
int
(
spRead
[
1
])
a
.
reference_start
=
int
(
spRead
[
3
])
a
.
reference_id
=
int
(
spRead
[
2
])
a
.
tags
=
((
"NH"
,
read
.
get_tag
(
"NH"
)),
(
"XS"
,
read
.
get_tag
(
"XS"
)),
(
"XT"
,
read
.
get_tag
(
"XT"
)))
headerFile
.
write
(
a
)
else
:
bcRead
=
str
(
read
)
spRead
=
bcRead
.
split
(
"
\t
"
)
a
=
pysam
.
AlignedSegment
()
a
.
query_name
=
str
(
readSeq
[
0
]
+
":"
+
readSeq
[
1
]
+
":"
+
readSeq
[
2
]
+
":"
+
readSeq
[
3
]
+
":"
+
readSeq
[
4
]
+
":"
+
readSeq
[
5
]
+
":"
+
readSeq
[
6
]
+
"_"
+
validBc
+
"_"
+
readUmi
)
a
.
mapping_quality
=
int
(
spRead
[
4
])
a
.
query_sequence
=
spRead
[
9
]
a
.
flag
=
int
(
spRead
[
1
])
a
.
reference_start
=
int
(
spRead
[
3
])
a
.
reference_id
=
int
(
spRead
[
2
])
a
.
tags
=
((
"NH"
,
read
.
get_tag
(
"NH"
)),
(
"XS"
,
read
.
get_tag
(
"XS"
)))
headerFile
.
write
(
a
)
def
checkBarcode10xv3
():
for
read
in
bamFiles
.
fetch
(
until_eof
=
True
):
# Splitting barcode of read
readSeq
=
read
.
qname
.
split
(
":"
)
rawBarcode
=
readSeq
[
7
]
rawBarcode
=
rawBarcode
.
strip
()
readBC
=
rawBarcode
[
0
:
16
]
#if not rawBarcode.startswith("*"):
if
readBC
in
bclist10xv3
:
validBc
=
readBC
readUmi
=
rawBarcode
[
16
:
29
]
if
"Assigned"
in
str
(
read
):
bcRead
=
str
(
read
)
spRead
=
bcRead
.
split
(
"
\t
"
)
a
=
pysam
.
AlignedSegment
()
a
.
query_name
=
str
(
readSeq
[
0
]
+
":"
+
readSeq
[
1
]
+
":"
+
readSeq
[
2
]
+
":"
+
readSeq
[
3
]
+
":"
+
readSeq
[
4
]
+
":"
+
readSeq
[
5
]
+
":"
+
readSeq
[
6
]
+
"_"
+
validBc
+
"_"
+
readUmi
)
a
.
mapping_quality
=
int
(
spRead
[
4
])
a
.
query_sequence
=
spRead
[
9
]
a
.
flag
=
int
(
spRead
[
1
])
a
.
reference_start
=
int
(
spRead
[
3
])
### FIX FOR PYSAM-VERSION 0.17.0 ###
# Changes spRead[2] from "number" to "#number" e.g. "2" to "#2"
# and causes an error when trying to convert to int
# Fix does not work with older pysam-versions!
tmp
=
spRead
[
2
].
replace
(
"#"
,
""
)
a
.
reference_id
=
int
(
tmp
)
a
.
tags
=
((
"NH"
,
read
.
get_tag
(
"NH"
)),
(
"XS"
,
read
.
get_tag
(
"XS"
)),
(
"XT"
,
read
.
get_tag
(
"XT"
)))
headerFile
.
write
(
a
)
else
:
bcRead
=
str
(
read
)
spRead
=
bcRead
.
split
(
"
\t
"
)
a
=
pysam
.
AlignedSegment
()
a
.
query_name
=
str
(
readSeq
[
0
]
+
":"
+
readSeq
[
1
]
+
":"
+
readSeq
[
2
]
+
":"
+
readSeq
[
3
]
+
":"
+
readSeq
[
4
]
+
":"
+
readSeq
[
5
]
+
":"
+
readSeq
[
6
]
+
"_"
+
validBc
+
"_"
+
readUmi
)
a
.
mapping_quality
=
int
(
spRead
[
4
])
a
.
query_sequence
=
spRead
[
9
]
a
.
flag
=
int
(
spRead
[
1
])
a
.
reference_start
=
int
(
spRead
[
3
])
### FIX FOR PYSAM-VERSION 0.17.0 ###
# Changes spRead[2] from "number" to "#number" e.g. "2" to "#2"
# and causes an error when trying to convert to int
# Fix does not work with older pysam-versions!
tmp
=
spRead
[
2
].
replace
(
"#"
,
""
)
a
.
reference_id
=
int
(
tmp
)
a
.
tags
=
((
"NH"
,
read
.
get_tag
(
"NH"
)),
(
"XS"
,
read
.
get_tag
(
"XS"
)))
headerFile
.
write
(
a
)
# print("Barcode: ", rawBarcode[0:17])
# print("UMI: ", rawBarcode[17:29])
def
checkddSeq
():
# Illumina algorithm follows:
# Reading-in each read from BAM file
for
read
in
bamFiles
.
fetch
(
until_eof
=
True
):
# Splitting barcode of read
readSeq
=
read
.
qname
.
split
(
":"
)
rawBarcode
=
readSeq
[
7
]
# print(readSeq)
# Determine position of linker 1
posLinker1
=
linkerPos
(
rawBarcode
,
linker1
)
# print posLinker1
# Position of linker 1 has to be bigger than 6, for BC1 to be complete
if
posLinker1
[
1
]
>
5
:
# and int(posLinker1[1])<14:
# print(posLinker1)
# Determine position of linker 2
posLinker2
=
linkerPos
(
rawBarcode
,
linker2
)
# print(posLinker2)
# if posLinker2[0]:
# print(posLinker2)
# Position of linker 2 has to be exactly position of linker 1 + 21 bases (no insertion or deletion allowed)
if
int
(
posLinker1
[
1
])
+
21
==
int
(
posLinker2
[
1
]):
# print(posLinker2)
# Cutting of phase block at the beginning of the read
matchLinker
=
rawBarcode
[
int
(
posLinker1
[
1
])
-
7
:]
# print(matchLinker)
# Cutting of access bases at the end of the read
matchLinker
=
matchLinker
[:
int
(
posLinker2
[
1
]
+
34
)]
# print(matchLinker)
# Read has to be at least 62 bases to be complete
if
len
(
matchLinker
)
>=
62
:
# print(matchLinker)
# Extracting ACG and GAC sequences
acg
=
matchLinker
[
48
:
51
]
# print(acg)
gac
=
matchLinker
[
59
:
62
]
# print(gac)
# Controlling sequences while allowing 1 ED
if
Levenshtein
.
hamming
(
acg
,
"ACG"
)
<=
1
and
Levenshtein
.
hamming
(
gac
,
"GAC"
)
<=
1
:
# print(matchLinker)
# Comparing barcodes with witelist and extracting UMI sequence
bc1
=
validBarcodes
(
matchLinker
[
0
:
6
],
barcodeList
)
# print bc1
bc2
=
validBarcodes
(
matchLinker
[
21
:
27
],
barcodeList
)
# print bc2
bc3
=
validBarcodes
(
matchLinker
[
42
:
48
],
barcodeList
)
# print bc3
validBc
=
bc1
+
bc2
+
bc3
# print validBc
readUmi
=
matchLinker
[
51
:
59
]
# print(validBc + "\t" + readUmi)
# Composed barcodes have to have a length of exactly 18 bases
if
len
(
validBc
)
==
18
and
"Assigned"
in
str
(
read
):
bcRead
=
str
(
read
)
spRead
=
bcRead
.
split
(
"
\t
"
)
a
=
pysam
.
AlignedSegment
()
a
.
query_name
=
str
(
readSeq
[
0
]
+
":"
+
readSeq
[
1
]
+
":"
+
readSeq
[
2
]
+
":"
+
readSeq
[
3
]
+
":"
+
readSeq
[
4
]
+
":"
+
readSeq
[
5
]
+
":"
+
readSeq
[
6
]
+
"_"
+
validBc
+
"_"
+
readUmi
)
a
.
mapping_quality
=
int
(
spRead
[
4
])
a
.
query_sequence
=
spRead
[
9
]
a
.
flag
=
int
(
spRead
[
1
])
a
.
reference_start
=
int
(
spRead
[
3
])
a
.
reference_id
=
int
(
spRead
[
2
])
a
.
tags
=
(
(
"NH"
,
read
.
get_tag
(
"NH"
)),
(
"XS"
,
read
.
get_tag
(
"XS"
)),
(
"XT"
,
read
.
get_tag
(
"XT"
)))
headerFile
.
write
(
a
)
elif
len
(
validBc
)
==
18
:
bcRead
=
str
(
read
)
spRead
=
bcRead
.
split
(
"
\t
"
)
a
=
pysam
.
AlignedSegment
()
a
.
query_name
=
str
(
readSeq
[
0
]
+
":"
+
readSeq
[
1
]
+
":"
+
readSeq
[
2
]
+
":"
+
readSeq
[
3
]
+
":"
+
readSeq
[
4
]
+
":"
+
readSeq
[
5
]
+
":"
+
readSeq
[
6
]
+
"_"
+
validBc
+
"_"
+
readUmi
)
a
.
mapping_quality
=
int
(
spRead
[
4
])
a
.
query_sequence
=
spRead
[
9
]
a
.
flag
=
int
(
spRead
[
1
])
a
.
reference_start
=
int
(
spRead
[
3
])
a
.
reference_id
=
int
(
spRead
[
2
])
a
.
tags
=
((
"NH"
,
read
.
get_tag
(
"NH"
)),
(
"XS"
,
read
.
get_tag
(
"XS"
)))
headerFile
.
write
(
a
)
for
read
in
fileForDetection
.
fetch
(
until_eof
=
True
):
readSeq
=
read
.
qname
.
split
(
":"
)
rawBarcode
=
readSeq
[
7
]
print
(
rawBarcode
)
if
rawBarcode
.
startswith
(
"*"
):
continue
else
:
if
len
(
rawBarcode
)
==
26
:
print
(
"10X v2"
)
checkBarcode10xv2
()
break
elif
len
(
rawBarcode
)
==
28
:
print
(
"10X v3"
)
checkBarcode10xv3
()
break
else
:
print
(
"ddSeq"
)
checkddSeq
()
break
fileForDetection
.
close
()
bamFiles
.
close
()
# Reading-in each read from BAM file
# for read in bamFiles.fetch(until_eof=True):
# Splitting barcode of read
# readSeq = read.qname.split(":")
# rawBarcode = readSeq[7]
#print(readSeq)
#print(rawBarcode)
#break
#if rawBarcode.startswith("*"):
# # Determine position of linker 1
# posLinker1 = linkerPos(rawBarcode, linker1)
# #print posLinker1
#
# # Position of linker 1 has to be bigger than 6, for BC1 to be complete
# if posLinker1[1] > 5: #and int(posLinker1[1])<14:
# #print(posLinker1)
#
# # Determine position of linker 2
# posLinker2 = linkerPos(rawBarcode, linker2)
# #print(posLinker2)
# #if posLinker2[0]:
# #print(posLinker2)
#
# # Position of linker 2 has to be exactly position of linker 1 + 21 bases (no insertion or deletion allowed)
# if int(posLinker1[1]) + 21 == int(posLinker2[1]):
# #print(posLinker2)
#
# # Cutting of phase block at the beginning of the read
# matchLinker = rawBarcode[int(posLinker1[1])-7:]
# #print(matchLinker)
#
# # Cutting of access bases at the end of the read
# matchLinker = matchLinker[:int(posLinker2[1]+34)]
# #print(matchLinker)
#
# # Read has to be at least 62 bases to be complete
# if len(matchLinker) >= 62:
# #print(matchLinker)
#
# # Extracting ACG and GAC sequences
# acg = matchLinker[48:51]
# #print(acg)
# gac = matchLinker[59:62]
# #print(gac)
#
# # Controlling sequences while allowing 1 ED
# if Levenshtein.hamming(acg, "ACG") <=1 and Levenshtein.hamming(gac, "GAC") <= 1:
# #print(matchLinker)
#
# # Comparing barcodes with witelist and extracting UMI sequence
# bc1 = validBarcodes(matchLinker[0:6], barcodeList)
# #print bc1
# bc2 = validBarcodes(matchLinker[21:27], barcodeList)
# #print bc2
# bc3 = validBarcodes(matchLinker[42:48], barcodeList)
# #print bc3
# validBc = bc1 + bc2 + bc3
# #print validBc
# readUmi = matchLinker[51:59]
# #print(validBc + "\t" + readUmi)
#
# # Composed barcodes have to have a length of exactly 18 bases
# if len(validBc) == 18 and "Assigned" in str(read):
#
# bcRead = str(read)
# spRead = bcRead.split("\t")
#
# a = pysam.AlignedSegment()
# a.query_name = str(readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
# a.mapping_quality = int(spRead[4])
# a.query_sequence = spRead[9]
# a.flag = int(spRead[1])
# a.reference_start = int(spRead[3])
# a.reference_id = int(spRead[2])
# a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")), ("XT", read.get_tag("XT")))
#
# headerFile.write(a)
#
# elif len(validBc) == 18:
#
# bcRead = str(read)
# spRead = bcRead.split("\t")
#
# a = pysam.AlignedSegment()
# a.query_name = str(readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
# a.mapping_quality = int(spRead[4])
# a.query_sequence = spRead[9]
# a.flag = int(spRead[1])
# a.reference_start = int(spRead[3])
# a.reference_id = int(spRead[2])
# a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")))
#
# headerFile.write(a)
# bamFiles.close()
Scripts/fcMetrics.py
0 → 100755
View file @
6681c791
#!/usr/bin/env python3
import
sys
import
json
import
os
import
pysam
# Reading file in
workingFile
=
pysam
.
AlignmentFile
(
sys
.
argv
[
1
],
"rb"
)
workingSample
=
sys
.
argv
[
2
]
# Dictionary
barcodeDict
=
{}
data
=
{}
# Cellbarcode stats
data
[
"Cell_barcode"
]
=
[]
data
[
"Assigned"
]
=
[]
data
[
"Unassigned_unmapped"
]
=
[]
data
[
"Unassigned_no_features"
]
=
[]
data
[
"Unassigned_ambiguity"
]
=
[]
data
[
"Unassigned_multimapping"
]
=
[]
# Opening file
# Read every line
for
line
in
workingFile
.
fetch
(
until_eof
=
True
):
# Taking the barcode and splitting the read
cellBc
=
line
.
query_name
.
split
(
"_"
)
read
=
str
(
line
).
strip
().
split
(
"
\t
"
)
# Write each barcode in a dictionary as key if it is not a multimapping with the assignment
if
int
(
read
[
1
])
!=
256
and
int
(
read
[
1
])
!=
272
:
assigTag
=
line
.
get_tag
(
"XS"
)
if
cellBc
[
1
]
in
barcodeDict
.
keys
():
barcodeDict
[
cellBc
[
1
]].
append
(
assigTag
)
else
:
barcodeDict
[
cellBc
[
1
]]
=
[
assigTag
]
#print(barcodeDict)
# Counting metrics for each cell barcode
for
cellBar
in
barcodeDict
.
keys
():
assigned
=
0
unmapped
=
0
noFeatures
=
0
ambiguity
=
0
multimapping
=
0
for
element
in
barcodeDict
[
cellBar
]:
# Counting assigned reads
if
"Assigned"
in
element
:
assigned
=
assigned
+
1
#print(element)
# Counting unmapped reads
elif
"Unmapped"
in
element
:
unmapped
=
unmapped
+
1
#print(element)
# Counting reads with no features
elif
"NoFeatures"
in
element
:
noFeatures
=
noFeatures
+
1
#print(element)
# Counting ambiguous reads
elif
"Ambiguity"
in
element
:
ambiguity
=
ambiguity
+
1
#print(element)
# Counting multimapping
else
:
multimapping
=
multimapping
+
1
#print(element)
# Saving all metrics for each cellbarcode in directory
if
not
int
(
assigned
)
==
0
:
data
[
"Cell_barcode"
].
append
(
workingSample
+
"_Zelle_"
+
str
(
cellBar
))
data
[
"Assigned"
].
append
(
int
(
assigned
))
data
[
"Unassigned_unmapped"
].
append
(
int
(
unmapped
))
data
[
"Unassigned_no_features"
].
append
(
int
(
noFeatures
))
data
[
"Unassigned_ambiguity"
].
append
(
int
(
ambiguity
))
data
[
"Unassigned_multimapping"
].
append
(
int
(
multimapping
))
# Writing directory in JSON file
workingDire
=
os
.
path
.
join
(
"Results/"
+
workingSample
+
"/JSON_Files/"
+
workingSample
+
"_Gene_Counts.json"
)
with
open
(
workingDire
,
"w"
)
as
wD
:
json
.
dump
(
data
,
wD
,
indent
=
2
)
Scripts/gene_counting_json.py
0 → 100755
View file @
6681c791
#!/usr/bin/env python3
# Importing modules
import
sys
import
os
import
json
# Reading-in BAM file
sample
=
sys
.
argv
[
1
]
sampleName
=
sys
.
argv
[
2
]
# Dictionary for stats
data
=
{}
data
[
'Sample'
]
=
[]
data
[
'Assigned'
]
=
[]
data
[
'Unassigned_unmapped'
]
=
[]
data
[
'Unassigned_multimapping'
]
=
[]
data
[
'Unassigned_no_features'
]
=
[]
data
[
'Unassigned_ambiguity'
]
=
[]
# Opening file and reading-in lines
if
sample
.
endswith
(
".summary"
):
with
open
(
sample
,
'r'
)
as
wF
:
for
line
in
wF
:
# Number of assigned reads
if
line
.
startswith
(
"Assigned"
):
assigned
=
line
.
strip
().
rsplit
(
"
\t
"
)
# Number of unmapped reads
elif
"Unassigned_Unmapped"
in
line
:
unmapped
=
line
.
strip
().
rsplit
(
"
\t
"
)
# Number of unassigned reads: MultiMapping
elif
"Unassigned_MultiMapping"
in
line
:
unassiMulti
=
line
.
strip
().
rsplit
(
"
\t
"
)
# Number of unassigned reads: NoFeatures
elif
"Unassigned_NoFeatures"
in
line
:
unassiNoFeat
=
line
.
strip
().
rsplit
(
"
\t
"
)
# Number of unassigned reads: Ambiguity
elif
"Unassigned_Ambiguity"
in
line
:
unassiAmbig
=
line
.
strip
().
rsplit
(
"
\t
"
)
# Writing stats in dictionary
data
[
'Sample'
]
=
sampleName
data
[
'Assigned'
]
=
int
(
assigned
[
1
])
data
[
'Unassigned_unmapped'
]
=
int
(
unmapped
[
1
])
data
[
'Unassigned_multimapping'
]
=
int
(
unassiMulti
[
1
])
data
[
'Unassigned_no_features'
]
=
int
(
unassiNoFeat
[
1
])
data
[
'Unassigned_ambiguity'
]
=
int
(
unassiAmbig
[
1
])
# Path for working directory and file name
workingDire
=
os
.
path
.
join
(
"Results/"
+
sampleName
+
"/JSON_Files/"
+
sampleName
+
"_Gene_Counts_Sample.json"
)
# Writing dictionary in JSON file
with
open
(
workingDire
,
'w'
)
as
working
: