Commit 6681c791 authored by ahoek's avatar ahoek
Browse files

Add neccessary files for WASP Snakemake workflow

parent d67f962e
#!/usr/bin/env python3
import sys
import json
import os
import pysam
# Reading files
cellcounter = sys.argv[1]
mapping = sys.argv[2]
validBc = pysam.AlignmentFile(sys.argv[3], 'rb')
sampleName = sys.argv[4]
# Opening file with number of counted cell barcodes
with open(cellcounter, 'r') as cFile:
# Getting the number of counted cell barcodes
cellbarcodes = 0
cellbarcodes = cFile.readline().strip().split("\t")
#print(cellbarcodes[1])
# Opening STAR log file
with open(mapping, 'r') as mFile:
rawReads=0
# Reading number of raw reads
for count, line in enumerate(mFile):
if count == 5:
rawReads = line.strip().split("\t")
#print(rawReads)
# Opening file with all reads with valid barcodes
# Reading every line
bcReads = 0
for line in validBc.fetch(until_eof=True):
read = str(line).split("\t")
#print(read[1])
# Check if read is not a multimapping
if int(read[1]) != 256 and int(read[1]) != 272:
bcReads = bcReads + 1
#print(bcReads)
# Saving all stats in a dictionary
sampleCells = {"Sample":sampleName, "Cell_barcodes":int(cellbarcodes[1]), "Raw_reads":int(rawReads[1]), "Reads_with_valid_barcode":int(bcReads)}
#print(sampleCells)
# Creating a name for JSON file
jsonFile = os.path.join("Results/" + sampleName + "/JSON_Files/" + sampleName + "_Cell_Numbers.json")
# Writing the dictionary in a JSON file
with open(jsonFile, 'w') as jFile:
json.dump(sampleCells, jFile)
#!/usr/bin/env python3
# Import of modules
import pysam
import sys
import Levenshtein
import py
# Reading-in BAM file of featureCounts analysis
bamFiles = pysam.AlignmentFile(sys.argv[1], "rb")
fileForDetection = pysam.AlignmentFile(sys.argv[1], "rb")
headerFile = pysam.AlignmentFile(sys.argv[2], "wb", template=bamFiles)
bclist10xv2 = set(line.strip() for line in open('Scripts/whitelist_10X_737K-august-2016.txt'))
bclist10xv3 = set(line.strip() for line in open('Scripts/whitelist_10X_3M-february-2018.txt'))
# Barcode whitelist
barcodeList = ["AACAGC", "ATACTT", "CCTCTA", "AACGTG", "ATAGCG", "CGAAAG", "GAGGCC", "GTACAG", "TCTAGC", "AAGCCA", "ATATAC", "CGAGCA", "GAGTGA", "AAAGAA", "AGTCTG", "GAGCTT", "AAGTAT", "ATCCGG", "CGCATA", "GATCAA", "AATTGG", "ATGAAG", "ACAAGG", "ATTAGT", "CCGTAA", "GACTCG", "GGTAGG", "TCGCCT", "GGTGCT", "TCGGGA", "GTCCTA", "TGAATT","GTCGGC", "TGAGAC", "CGGCGT", "GCCAGA", "GTGGTG", "TGCGGT", "CGGTCC", "GCCGTT", "GTTAAC", "TGCTAA", "GTTTCA", "TGGCAG", "ACCCAA", "CAACCG", "CGTTAT", "GCGAAT", "ACCTTC","GCGCGG", "TAAGCT", "TGTGTA", "ACGGAC", "CACCAC", "CTATTA", "GCTCCC", "TAATAG", "TGTTCG", "ACTGCA", "CACTGT","CTCAAT", "GCTGAG", "TACCGA", "TTAAGA", "AGACCC","CAGACT", "CTGTGG", "GCTTGT", "AGATGT", "CAAGTC", "CTAGGT", "CAGGAG", "CTTACG", "AGCACG", "CATAGA", "CTTGAA", "TAGAGG", "TTCGCA", "GGACGA", "TATTTC", "TTCTTG", "GGATTG", "TCAGTG", "TTGCTC","GGCCAT", "TCATCA", "TTGGAT", "AGGTTA", "CCACGC", "GAAATA", "AGTAAA", "CCGATG", "GAAGGG", "GGGATC", "TCCAAG", "TTTGGG"]
# Function to compare two sequences, allowing 1 ED
linker1 = 'TAGCCATCGCATTGC'
linker2 = 'TACCTCTGAGCTGAA'
def linkerPos(read, linker):
min = 1
pos = 0
hit = False
for i in range(len(read) - len(linker)+1):
substr = read[i:i+len(linker)]
hamDist = Levenshtein.hamming(substr, linker)
if hamDist <= min:
min = hamDist
pos = i+1
hit = True
return (hit,pos)
# Function to compare barcodes of read with barcodes of whitelist
def validBarcodes(rbc, bcList):
newBarcode = ""
for bcL in bcList:
if Levenshtein.hamming(rbc, bcL) <= 1:
newBarcode = bcL
return newBarcode
def checkBarcode10xv2():
for read in bamFiles.fetch(until_eof=True):
# Splitting barcode of read
readSeq = read.qname.split(":")
rawBarcode = readSeq[7]
rawBarcode = rawBarcode.strip()
readBC = rawBarcode[0:16]
#if not rawBarcode.startswith("*"):
if readBC in bclist10xv2:
validBc = readBC
readUmi = rawBarcode[16:26]
if "Assigned" in str(read):
bcRead = str(read)
spRead = bcRead.split("\t")
a = pysam.AlignedSegment()
a.query_name = str(
readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" +
readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
a.mapping_quality = int(spRead[4])
a.query_sequence = spRead[9]
a.flag = int(spRead[1])
a.reference_start = int(spRead[3])
a.reference_id = int(spRead[2])
a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")), ("XT", read.get_tag("XT")))
headerFile.write(a)
else:
bcRead = str(read)
spRead = bcRead.split("\t")
a = pysam.AlignedSegment()
a.query_name = str(
readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" +
readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
a.mapping_quality = int(spRead[4])
a.query_sequence = spRead[9]
a.flag = int(spRead[1])
a.reference_start = int(spRead[3])
a.reference_id = int(spRead[2])
a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")))
headerFile.write(a)
def checkBarcode10xv3():
for read in bamFiles.fetch(until_eof=True):
# Splitting barcode of read
readSeq = read.qname.split(":")
rawBarcode = readSeq[7]
rawBarcode = rawBarcode.strip()
readBC = rawBarcode[0:16]
#if not rawBarcode.startswith("*"):
if readBC in bclist10xv3:
validBc = readBC
readUmi = rawBarcode[16:29]
if "Assigned" in str(read):
bcRead = str(read)
spRead = bcRead.split("\t")
a = pysam.AlignedSegment()
a.query_name = str(readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
a.mapping_quality = int(spRead[4])
a.query_sequence = spRead[9]
a.flag = int(spRead[1])
a.reference_start = int(spRead[3])
### FIX FOR PYSAM-VERSION 0.17.0 ###
# Changes spRead[2] from "number" to "#number" e.g. "2" to "#2"
# and causes an error when trying to convert to int
# Fix does not work with older pysam-versions!
tmp = spRead[2].replace("#","")
a.reference_id = int(tmp)
a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")), ("XT", read.get_tag("XT")))
headerFile.write(a)
else:
bcRead = str(read)
spRead = bcRead.split("\t")
a = pysam.AlignedSegment()
a.query_name = str(readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
a.mapping_quality = int(spRead[4])
a.query_sequence = spRead[9]
a.flag = int(spRead[1])
a.reference_start = int(spRead[3])
### FIX FOR PYSAM-VERSION 0.17.0 ###
# Changes spRead[2] from "number" to "#number" e.g. "2" to "#2"
# and causes an error when trying to convert to int
# Fix does not work with older pysam-versions!
tmp = spRead[2].replace("#","")
a.reference_id = int(tmp)
a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")))
headerFile.write(a)
# print("Barcode: ", rawBarcode[0:17])
# print("UMI: ", rawBarcode[17:29])
def checkddSeq():
# Illumina algorithm follows:
# Reading-in each read from BAM file
for read in bamFiles.fetch(until_eof=True):
# Splitting barcode of read
readSeq = read.qname.split(":")
rawBarcode = readSeq[7]
# print(readSeq)
# Determine position of linker 1
posLinker1 = linkerPos(rawBarcode, linker1)
# print posLinker1
# Position of linker 1 has to be bigger than 6, for BC1 to be complete
if posLinker1[1] > 5: # and int(posLinker1[1])<14:
# print(posLinker1)
# Determine position of linker 2
posLinker2 = linkerPos(rawBarcode, linker2)
# print(posLinker2)
# if posLinker2[0]:
# print(posLinker2)
# Position of linker 2 has to be exactly position of linker 1 + 21 bases (no insertion or deletion allowed)
if int(posLinker1[1]) + 21 == int(posLinker2[1]):
# print(posLinker2)
# Cutting of phase block at the beginning of the read
matchLinker = rawBarcode[int(posLinker1[1]) - 7:]
# print(matchLinker)
# Cutting of access bases at the end of the read
matchLinker = matchLinker[:int(posLinker2[1] + 34)]
# print(matchLinker)
# Read has to be at least 62 bases to be complete
if len(matchLinker) >= 62:
# print(matchLinker)
# Extracting ACG and GAC sequences
acg = matchLinker[48:51]
# print(acg)
gac = matchLinker[59:62]
# print(gac)
# Controlling sequences while allowing 1 ED
if Levenshtein.hamming(acg, "ACG") <= 1 and Levenshtein.hamming(gac, "GAC") <= 1:
# print(matchLinker)
# Comparing barcodes with witelist and extracting UMI sequence
bc1 = validBarcodes(matchLinker[0:6], barcodeList)
# print bc1
bc2 = validBarcodes(matchLinker[21:27], barcodeList)
# print bc2
bc3 = validBarcodes(matchLinker[42:48], barcodeList)
# print bc3
validBc = bc1 + bc2 + bc3
# print validBc
readUmi = matchLinker[51:59]
# print(validBc + "\t" + readUmi)
# Composed barcodes have to have a length of exactly 18 bases
if len(validBc) == 18 and "Assigned" in str(read):
bcRead = str(read)
spRead = bcRead.split("\t")
a = pysam.AlignedSegment()
a.query_name = str(
readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[
4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
a.mapping_quality = int(spRead[4])
a.query_sequence = spRead[9]
a.flag = int(spRead[1])
a.reference_start = int(spRead[3])
a.reference_id = int(spRead[2])
a.tags = (
("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")), ("XT", read.get_tag("XT")))
headerFile.write(a)
elif len(validBc) == 18:
bcRead = str(read)
spRead = bcRead.split("\t")
a = pysam.AlignedSegment()
a.query_name = str(
readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[
4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
a.mapping_quality = int(spRead[4])
a.query_sequence = spRead[9]
a.flag = int(spRead[1])
a.reference_start = int(spRead[3])
a.reference_id = int(spRead[2])
a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")))
headerFile.write(a)
for read in fileForDetection.fetch(until_eof=True):
readSeq = read.qname.split(":")
rawBarcode = readSeq[7]
print(rawBarcode)
if rawBarcode.startswith("*"):
continue
else:
if len(rawBarcode) == 26:
print("10X v2")
checkBarcode10xv2()
break
elif len(rawBarcode) == 28:
print("10X v3")
checkBarcode10xv3()
break
else:
print("ddSeq")
checkddSeq()
break
fileForDetection.close()
bamFiles.close()
# Reading-in each read from BAM file
# for read in bamFiles.fetch(until_eof=True):
# Splitting barcode of read
# readSeq = read.qname.split(":")
# rawBarcode = readSeq[7]
#print(readSeq)
#print(rawBarcode)
#break
#if rawBarcode.startswith("*"):
# # Determine position of linker 1
# posLinker1 = linkerPos(rawBarcode, linker1)
# #print posLinker1
#
# # Position of linker 1 has to be bigger than 6, for BC1 to be complete
# if posLinker1[1] > 5: #and int(posLinker1[1])<14:
# #print(posLinker1)
#
# # Determine position of linker 2
# posLinker2 = linkerPos(rawBarcode, linker2)
# #print(posLinker2)
# #if posLinker2[0]:
# #print(posLinker2)
#
# # Position of linker 2 has to be exactly position of linker 1 + 21 bases (no insertion or deletion allowed)
# if int(posLinker1[1]) + 21 == int(posLinker2[1]):
# #print(posLinker2)
#
# # Cutting of phase block at the beginning of the read
# matchLinker = rawBarcode[int(posLinker1[1])-7:]
# #print(matchLinker)
#
# # Cutting of access bases at the end of the read
# matchLinker = matchLinker[:int(posLinker2[1]+34)]
# #print(matchLinker)
#
# # Read has to be at least 62 bases to be complete
# if len(matchLinker) >= 62:
# #print(matchLinker)
#
# # Extracting ACG and GAC sequences
# acg = matchLinker[48:51]
# #print(acg)
# gac = matchLinker[59:62]
# #print(gac)
#
# # Controlling sequences while allowing 1 ED
# if Levenshtein.hamming(acg, "ACG") <=1 and Levenshtein.hamming(gac, "GAC") <= 1:
# #print(matchLinker)
#
# # Comparing barcodes with witelist and extracting UMI sequence
# bc1 = validBarcodes(matchLinker[0:6], barcodeList)
# #print bc1
# bc2 = validBarcodes(matchLinker[21:27], barcodeList)
# #print bc2
# bc3 = validBarcodes(matchLinker[42:48], barcodeList)
# #print bc3
# validBc = bc1 + bc2 + bc3
# #print validBc
# readUmi = matchLinker[51:59]
# #print(validBc + "\t" + readUmi)
#
# # Composed barcodes have to have a length of exactly 18 bases
# if len(validBc) == 18 and "Assigned" in str(read):
#
# bcRead = str(read)
# spRead = bcRead.split("\t")
#
# a = pysam.AlignedSegment()
# a.query_name = str(readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
# a.mapping_quality = int(spRead[4])
# a.query_sequence = spRead[9]
# a.flag = int(spRead[1])
# a.reference_start = int(spRead[3])
# a.reference_id = int(spRead[2])
# a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")), ("XT", read.get_tag("XT")))
#
# headerFile.write(a)
#
# elif len(validBc) == 18:
#
# bcRead = str(read)
# spRead = bcRead.split("\t")
#
# a = pysam.AlignedSegment()
# a.query_name = str(readSeq[0] + ":" + readSeq[1] + ":" + readSeq[2] + ":" + readSeq[3] + ":" + readSeq[4] + ":" + readSeq[5] + ":" + readSeq[6] + "_" + validBc + "_" + readUmi)
# a.mapping_quality = int(spRead[4])
# a.query_sequence = spRead[9]
# a.flag = int(spRead[1])
# a.reference_start = int(spRead[3])
# a.reference_id = int(spRead[2])
# a.tags = (("NH", read.get_tag("NH")), ("XS", read.get_tag("XS")))
#
# headerFile.write(a)
# bamFiles.close()
#!/usr/bin/env python3
import sys
import json
import os
import pysam
# Reading file in
workingFile = pysam.AlignmentFile(sys.argv[1], "rb")
workingSample = sys.argv[2]
# Dictionary
barcodeDict = {}
data = {}
# Cellbarcode stats
data["Cell_barcode"] = []
data["Assigned"] = []
data["Unassigned_unmapped"] = []
data["Unassigned_no_features"] = []
data["Unassigned_ambiguity"] = []
data["Unassigned_multimapping"] = []
# Opening file
# Read every line
for line in workingFile.fetch(until_eof=True):
# Taking the barcode and splitting the read
cellBc = line.query_name.split("_")
read = str(line).strip().split("\t")
# Write each barcode in a dictionary as key if it is not a multimapping with the assignment
if int(read[1]) != 256 and int(read[1]) != 272:
assigTag = line.get_tag("XS")
if cellBc[1] in barcodeDict.keys():
barcodeDict[cellBc[1]].append(assigTag)
else:
barcodeDict[cellBc[1]] = [assigTag]
#print(barcodeDict)
# Counting metrics for each cell barcode
for cellBar in barcodeDict.keys():
assigned = 0
unmapped = 0
noFeatures = 0
ambiguity = 0
multimapping = 0
for element in barcodeDict[cellBar]:
# Counting assigned reads
if "Assigned" in element:
assigned = assigned + 1
#print(element)
# Counting unmapped reads
elif "Unmapped" in element:
unmapped = unmapped + 1
#print(element)
# Counting reads with no features
elif "NoFeatures" in element:
noFeatures = noFeatures + 1
#print(element)
# Counting ambiguous reads
elif "Ambiguity" in element:
ambiguity = ambiguity + 1
#print(element)
# Counting multimapping
else:
multimapping = multimapping + 1
#print(element)
# Saving all metrics for each cellbarcode in directory
if not int(assigned) == 0:
data["Cell_barcode"].append(workingSample + "_Zelle_" + str(cellBar))
data["Assigned"].append(int(assigned))
data["Unassigned_unmapped"].append(int(unmapped))
data["Unassigned_no_features"].append(int(noFeatures))
data["Unassigned_ambiguity"].append(int(ambiguity))
data["Unassigned_multimapping"].append(int(multimapping))
# Writing directory in JSON file
workingDire = os.path.join("Results/" + workingSample + "/JSON_Files/" + workingSample + "_Gene_Counts.json")
with open(workingDire, "w") as wD:
json.dump(data, wD, indent=2)
#!/usr/bin/env python3
# Importing modules
import sys
import os
import json
# Reading-in BAM file
sample = sys.argv[1]
sampleName = sys.argv[2]
# Dictionary for stats
data = {}
data['Sample'] = []
data['Assigned'] = []
data['Unassigned_unmapped'] = []
data['Unassigned_multimapping'] = []
data['Unassigned_no_features'] = []
data['Unassigned_ambiguity'] = []
# Opening file and reading-in lines
if sample.endswith(".summary"):
with open (sample, 'r') as wF:
for line in wF:
# Number of assigned reads
if line.startswith("Assigned"):
assigned = line.strip().rsplit("\t")
# Number of unmapped reads
elif "Unassigned_Unmapped" in line:
unmapped = line.strip().rsplit("\t")
# Number of unassigned reads: MultiMapping
elif "Unassigned_MultiMapping" in line:
unassiMulti = line.strip().rsplit("\t")
# Number of unassigned reads: NoFeatures
elif "Unassigned_NoFeatures" in line:
unassiNoFeat = line.strip().rsplit("\t")
# Number of unassigned reads: Ambiguity
elif "Unassigned_Ambiguity" in line:
unassiAmbig = line.strip().rsplit("\t")
# Writing stats in dictionary
data['Sample'] = sampleName
data['Assigned'] = int(assigned[1])
data['Unassigned_unmapped'] = int(unmapped[1])
data['Unassigned_multimapping'] = int(unassiMulti[1])
data['Unassigned_no_features'] = int(unassiNoFeat[1])
data['Unassigned_ambiguity'] = int(unassiAmbig[1])
# Path for working directory and file name
workingDire = os.path.join("Results/" + sampleName + "/JSON_Files/" + sampleName + "_Gene_Counts_Sample.json")
# Writing dictionary in JSON file
with open (workingDire, 'w') as working: