Commit b112d79b authored by lmueller

modified retriever scripts so invalid dbxrefs do not cause a crash

parent f4475346
@@ -23,7 +23,7 @@ def resolve(strings, check_existence=True):
for s in strings:
status = STATUS_NOT_CHECKED
if check_existence:
-            status = check_dbxref_exists(s)
+            status = check_dbxref_exists(s)
dbxref = convert_string_to_dbxref(s)
if dbxref['db'] in providers:
provider = providers[dbxref['db']]
@@ -51,7 +51,7 @@ def check_dbxref_exists(string):
return exists
else:
return STATUS_CHECK_NOT_SUPPORTED
-    return STATUS_UNSUPPORTED_DB
+    return STATUS_UNSUPPORTED_DB
def compile_url(template, dbxref):
return template.replace('%i', dbxref['id']).replace('%d', dbxref['db'])
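
As a quick illustration of the hunk above: compile_url simply substitutes %i and %d into a resource template from the provider config. A minimal sketch, with the dbxref values made up for the example (the EC template appears in the config below):

```python
# Sketch of how compile_url expands a resource template; the dbxref
# values are illustrative, the template is the Enzyme html entry below.
def compile_url(template, dbxref):
    return template.replace('%i', dbxref['id']).replace('%d', dbxref['db'])

dbxref = {'db': 'EC', 'id': '1.1.1.1'}
print(compile_url('https://enzyme.expasy.org/EC/%i', dbxref))
# -> https://enzyme.expasy.org/EC/1.1.1.1
```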
- name: Enzyme
prefixes: ["EC"]
prefixes: ["EC", "ec"]
resources:
html: ["https://enzyme.expasy.org/EC/%i"]
text: ["https://enzyme.expasy.org/EC/%i.txt"]
@@ -8,14 +8,14 @@
type: 'external'
location: 'scripts/retrieve_enzyme.py'
- name: Gene Identifier
-  prefixes: [GI]
+  prefixes: ["GI", "gi"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/protein/GI:%i"]
xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=protein&dopt=xml&sort=&val=%i&retmode=file"
- name: Uniprot
prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot"]
prefixes: ["UniProtKB/TrEMBL", "UniProtKB/Swiss-Prot", "uniprotkb/trembl", "uniprotkb/swiss-prot"]
resources:
html: [ "http://www.uniprot.org/uniprot/%i"]
xml: [ "http://www.uniprot.org/uniprot/%i.xml"]
@@ -30,7 +30,7 @@
xml: ["http://www.uniprot.org/taxonomy/%i.rdf"]
check_existence: "http://www.uniprot.org/taxonomy/%i"
- name: SequenceOntology
prefixes: ["SO"]
prefixes: ["SO", "so"]
resources:
html: ["http://www.sequenceontology.org/browser/current_svn/term/SO:%i"]
obo: ["http://www.sequenceontology.org/browser/current_svn/export/term_only/obo/SO:%i"]
@@ -41,7 +41,7 @@
type: 'external'
location: 'scripts/retrieve_sequence_ontology.py'
- name: RFAM
prefixes: ["RFAM"]
prefixes: ["RFAM", "rfam"]
resources:
html: ["http://rfam.xfam.org/family/%i"]
xml: ["http://rfam.xfam.org/family/%i?content-type=text%2Fxml"]
@@ -53,7 +53,7 @@
html: ["http://www.ncbi.nlm.nih.gov/pubmed/%i"]
check_existence: "http://www.ncbi.nlm.nih.gov/pubmed/%i"
- name: Protein Families
prefixes: ["PFAM"]
prefixes: ["PFAM", "pfam"]
resources:
html: ["http://pfam.xfam.org/family/%i"]
xml: ["http://pfam.xfam.org/family/%i?output=xml"]
@@ -63,26 +63,26 @@
type: 'external'
location: 'scripts/retrieve_pfam.py'
- name: PDB
prefixes: ["PDB"]
prefixes: ["PDB", "pdb"]
resources:
html: ["http://www.rcsb.org/pdb/explore/explore.do?structureId=%i"]
xml: ["http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"]
check_existence: "http://www.rcsb.org/pdb/download/downloadFile.do?fileFormat=xml&compression=NO&structureId=%i"
- name: InterPro
prefixes: ["InterPro"]
prefixes: ["InterPro", "interpro"]
resources:
html: ["http://www.ebi.ac.uk/interpro/entry/%i"]
# does not work
# check_existence: "http://www.ebi.ac.uk/interpro/entry/%i"
- name: GeneID
prefixes: ["GeneID"]
prefixes: ["GeneID", "geneid"]
resources:
html: ["http://www.ncbi.nlm.nih.gov/gene/%i"]
xml: ["http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"]
# does not work
# check_existence: "http://www.ncbi.nlm.nih.gov/sviewer/viewer.cgi?tool=portal&db=gene&dopt=xml&sort=&val=%i&retmode=file"
- name: Gene Ontology
prefixes: ["GO"]
prefixes: ["GO", "go"]
resources:
html: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i"]
xml: ["http://www.ebi.ac.uk/QuickGO/GTerm?id=GO:%i&format=oboxml"]
@@ -25,13 +25,16 @@ def main():
logger.debug('Content: %s', r.text)
d = json.loads(r.text)
output = {'id': entry['dbxref']}
-        if args.basic:
-            output.update(read_basic(d))
-        if args.relations:
-            output.update(read_relations(d))
-        if not args.basic and not args.relations:
-            output.update(read_basic(d))
-            output.update(read_relations(d))
+        if not 'messages' in d:
+            if args.basic:
+                output.update(read_basic(d))
+            if args.relations:
+                output.update(read_relations(d))
+            if not args.basic and not args.relations:
+                output.update(read_basic(d))
+                output.update(read_relations(d))
+        else:
+            output['messages'] = d['messages']
documents.append(output)
print (json.dumps(documents))
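
The change above is the crash fix for this retriever: the JSON payload is only parsed when it carries no 'messages' field, otherwise the messages are passed through to the output document. A condensed sketch of the same guard, with read_basic stubbed out and both payloads invented:

```python
# Condensed form of the guard added above; read_basic is a stand-in
# for the real parser and the sample payloads are made up.
import json

def read_basic(d):
    return {'definition': d.get('definition')}

def handle(entry, d):
    output = {'id': entry['dbxref']}
    if 'messages' not in d:                 # valid dbxref: parse as before
        output.update(read_basic(d))
    else:                                   # invalid dbxref: report, don't crash
        output['messages'] = d['messages']
    return output

print(json.dumps([
    handle({'dbxref': 'GO:0006915'}, {'definition': 'apoptotic process'}),
    handle({'dbxref': 'GO:notanid'}, {'messages': ['Term not found']}),
]))
```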
@@ -38,15 +38,16 @@ def main():
elements = []
else:
elements.append(line.strip())
-        d = resolve_elements(elements)
         output = {'id': entry['dbxref']}
-        if 'id' in d and d['id'] == entry['dbxref'] and args.basic:
-            output.update(format_output(d))
-        if args.relations:
-            output['relations'] = resolve_relations(entry)
-        if 'id' in d and d['id'] == entry['dbxref'] and not args.basic and not args.relations:
-            output.update(format_output(d))
-            output['relations'] = resolve_relations(entry)
+        if not '<title>500 Internal Server Error</title>' in elements:
+            d = resolve_elements(elements)
+            if 'id' in d and d['id'] == entry['dbxref'] and args.basic:
+                output.update(format_output(d))
+            if args.relations:
+                output['relations'] = resolve_relations(entry)
+            if 'id' in d and d['id'] == entry['dbxref'] and not args.basic and not args.relations:
+                output.update(format_output(d))
+                output['relations'] = resolve_relations(entry)
documents.append(output)
print (json.dumps(documents))
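
Here the retriever guards against the upstream service answering with an HTML 500 page instead of the expected flat-file record. A self-contained sketch of the pattern; resolve_elements and format_output are toy stand-ins for the real parsers and the sample records are invented:

```python
# Toy version of the guard added above.
def resolve_elements(elements):
    d = {}
    for line in elements:
        if line.startswith('ID'):
            d['id'] = line.split()[1]
        elif line.startswith('DE'):
            d['description'] = line[2:].strip()
    return d

def format_output(d):
    return {'description': d.get('description')}

def parse_entry(entry, elements):
    output = {'id': entry['dbxref']}
    # Skip parsing when the body is an error page rather than a record.
    if '<title>500 Internal Server Error</title>' not in elements:
        d = resolve_elements(elements)
        if 'id' in d and d['id'] == entry['dbxref']:
            output.update(format_output(d))
    return output

print(parse_entry({'dbxref': '1.1.1.1'},
                  ['ID   1.1.1.1', 'DE   Alcohol dehydrogenase.']))
print(parse_entry({'dbxref': '9.9.9.9'},
                  ['<title>500 Internal Server Error</title>']))
```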
@@ -37,21 +37,24 @@ def main():
logger.debug('URL: %s', xml_url)
r = requests.get(xml_url)
logger.debug('Content: %s', r.text)
-        root = ET.fromstring(r.text)
         output = {'id': entry['dbxref']}
-        for child in root.findall('uniprot:entry', ns):
-            if args.basic:
-                output.update(read_basic(child))
-            if args.sequence:
-                output.update(read_sequence(child))
-            if args.organism:
-                output.update(read_taxonomy(child))
-            if args.annotation:
-                output.update(read_annotation(child))
-            if args.features:
-                output['features'] = read_features(child)
+        try:
+            root = ET.fromstring(r.text)
+            for child in root.findall('uniprot:entry', ns):
+                if args.basic:
+                    output.update(read_basic(child))
+                if args.sequence:
+                    output.update(read_sequence(child))
+                if args.organism:
+                    output.update(read_taxonomy(child))
+                if args.annotation:
+                    output.update(read_annotation(child))
+                if args.features:
+                    output['features'] = read_features(child)
+        except:
+            pass
documents.append(output)
print(json.dumps(documents))
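
The UniProt retriever takes the same approach via try/except around the XML parsing, so a non-XML error body for an invalid accession yields a document with only its id instead of a crash. A minimal sketch, assuming the usual UniProt XML namespace, with read_basic reduced to a stub; note the sketch catches ET.ParseError where the commit uses a bare except:

```python
# Minimal sketch of the parse guard added above; read_basic is a stub
# and both input strings are invented.
import xml.etree.ElementTree as ET

ns = {'uniprot': 'http://uniprot.org/uniprot'}

def read_basic(elem):
    name = elem.find('uniprot:name', ns)
    return {'name': name.text if name is not None else None}

def parse_uniprot(entry, text):
    output = {'id': entry['dbxref']}
    try:
        root = ET.fromstring(text)
        for child in root.findall('uniprot:entry', ns):
            output.update(read_basic(child))
    except ET.ParseError:   # narrower than the bare except in the commit
        pass
    return output

good = ('<uniprot xmlns="http://uniprot.org/uniprot">'
        '<entry><name>TEST_HUMAN</name></entry></uniprot>')
bad = 'Invalid accession'   # e.g. a plain-text or HTML error body
print(parse_uniprot({'dbxref': 'P12345'}, good))
print(parse_uniprot({'dbxref': 'NOTREAL'}, bad))
```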