Skip to content

Commit 0877984

Browse files
Merge branch 'release3' of github.com:micheldumontier/bio2rdf-scripts into release3
2 parents d34050b + d6d205e commit 0877984

File tree

11 files changed

+120
-83
lines changed

11 files changed

+120
-83
lines changed

README.md

Lines changed: 0 additions & 26 deletions
This file was deleted.

goa/goa.php

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,8 @@ class GOAParser extends Bio2RDFizer
3636
{
3737
function __construct($argv) {
3838
parent::__construct($argv,"goa");
39-
parent::addParameter('files',true,'all|arabidopsis|chicken|cow|dicty|dog|fly|human|mouse|pdb|pig|rat|uniprot|worm|yeast|zebrafish','all','all or comma-separated list of files to process');
39+
// parent::addParameter('files',true,'all|arabidopsis|chicken|cow|dicty|dog|fly|human|mouse|pdb|pig|rat|uniprot|worm|yeast|zebrafish','all','all or comma-separated list of files to process');
40+
parent::addParameter('files',true,'all|arabidopsis|chicken|cow|dicty|dog|fly|human|mouse|pig|rat|worm|yeast|zebrafish','all','all or comma-separated list of files to process');
4041
parent::addParameter('download_url',false,null,'ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/');
4142
parent::initialize();
4243
}
@@ -161,9 +162,13 @@ function process($file){
161162

162163
//entity id
163164
$eid = $this->getdbURI($db,$id);
165+
if(!$eid) {
166+
print_r($fields);
167+
continue;
168+
}
164169
parent::addRDF(
165170
parent::describeIndividual($eid,$label,parent::getVoc()."GO-Annotation").
166-
parent::describeClass(parent::getVoc()."GO-Annotation","GO Annotation").
171+
parent::describeClass(parent::getVoc()."GO-Annotation","GO Annotation").
167172
parent::triplifyString($eid,parent::getVoc()."symbol",$symbol)
168173
);
169174
parent::addRDF(
@@ -261,6 +266,8 @@ function getdbURI($db_id, $db_object_id){
261266
} else if ($db_id == "PDB"){
262267
$split_object = explode("_", $db_object_id);
263268
$returnMe = "pdb:".$split_object[0]."/chain_".$split_object[1];
269+
} else {
270+
$returnMe = $db_id.":".$db_object_id;
264271
}
265272
return $returnMe;
266273
}

hgnc/hgnc.php

Lines changed: 18 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -117,11 +117,13 @@ function Run(){
117117
function process(){
118118
$header = $this->GetReadFile()->Read(200000);
119119
$header_arr = explode("\t", $header);
120-
121-
if (count($header_arr) != 40)
120+
$n = 41;
121+
$c = count($header_arr);
122+
if ($c != $n)
122123
{
123124
echo PHP_EOL;
124-
trigger_error ("Header format is different than expected, please update the script",E_USER_ERROR);
125+
print_r($header_arr);
126+
trigger_error ("Expected $n columns, found $c . please update the script",E_USER_ERROR);
125127
exit;
126128
}
127129

@@ -164,9 +166,10 @@ function process(){
164166
$refseq_mappeddatasuppliedbyNCBI = $fields[34];
165167
$uniprot_id_mappeddatasuppliedbyUniProt = $fields[35];
166168
$ensembl_id_mappeddatasuppliedbyEnsembl = $fields[36];
167-
$ucsc_id_mappeddatasuppliedbyUCSC = $fields[37];
168-
$mouse_genome_database_id_mappeddatasuppliedbyMGI = $fields[38];
169-
$rat_genome_database_id_mappeddatasuppliedbyRGD = $fields[39];
169+
$vega_id_mappeddatasuppliedbyVega = $fields[37];
170+
$ucsc_id_mappeddatasuppliedbyUCSC = $fields[38];
171+
$mouse_genome_database_id_mappeddatasuppliedbyMGI = $fields[39];
172+
$rat_genome_database_id_mappeddatasuppliedbyRGD = $fields[40];
170173

171174
$id_res = $id;
172175
$id_label = "Gene Symbol for ".$approved_symbol;
@@ -464,6 +467,15 @@ function process(){
464467
}
465468
}
466469

470+
if(!empty($ucsc_id_mappeddatasuppliedbyVega)){
471+
$ucsc_id_mappeddatasuppliedbyVega = explode(", ", $ucsc_id_mappeddatasuppliedbyVega);
472+
foreach ($ucsc_id_mappeddatasuppliedbyVega as $vega_id) {
473+
parent::AddRDF(
474+
parent::triplify($id_res, $this->getVoc()."x-vega", "vega:".trim($vega_id)).
475+
parent::describeProperty($this->getVoc()."x-vega", "Vega entry")
476+
);
477+
}
478+
}
467479
if(!empty($ucsc_id_mappeddatasuppliedbyUCSC)){
468480
$ucsc_id_mappeddatasuppliedbyUCSC = explode(", ", $ucsc_id_mappeddatasuppliedbyUCSC);
469481
foreach ($ucsc_id_mappeddatasuppliedbyUCSC as $ucsc_id) {

irefindex/irefindex.php

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -33,8 +33,8 @@ class irefindexParser extends Bio2RDFizer
3333
{
3434
function __construct($argv) { //
3535
parent::__construct($argv,"irefindex");
36-
parent::addParameter('files',true,'all|10090|10116|4932|559292|562|6239|7227|9606|A','all','all or comma-separated list of files to process');
37-
parent::addParameter('version',false,'08122013|03022013|10182011','08122013','dated version of files to download');
36+
parent::addParameter('files',true,'all|10090|10116|4932|559292|562|6239|7227|9606','all','all or comma-separated list of files to process');
37+
parent::addParameter('version',false,'07042015|08122013|03022013|10182011','07042015','dated version of files to download');
3838
parent::addParameter('download_url',false,null,'http://irefindex.org/download/irefindex/data/current/psi_mitab/MITAB2.6/');
3939
parent::initialize();
4040
}
@@ -55,9 +55,8 @@ function Run()
5555

5656
foreach($files AS $file) {
5757
$download = parent::getParameterValue('download');
58-
59-
$base_file = ucfirst($file).".mitab.".parent::getParameterValue("version").".txt";
60-
$zip_file = $base_file.".zip";
58+
$version = parent::getParameterValue("version");
59+
$zip_file = ucfirst($file).".mitab.".$version.".txt.zip";
6160
$lfile = $ldir.$zip_file;
6261

6362
$gz = (strstr(parent::getParameterValue('output_format'),".gz") === FALSE)?false:true;
@@ -82,9 +81,14 @@ function Run()
8281
trigger_error("Unable to open $lfile");
8382
exit;
8483
}
84+
if($zin->numFiles != 1) {
85+
trigger_error("Found more than one file ... using first file");
86+
}
87+
$f = $zin->statIndex(0);
88+
$base_file = $f['name'];
8589
if(($fp = $zin->getStream($base_file)) === FALSE) {
86-
trigger_error("Unable to get $base_file in ziparchive $lfile");
87-
return FALSE;
90+
trigger_error("Unable to get $base_file in ziparchive $lfile");
91+
return FALSE;
8892
}
8993
parent::setReadFile($lfile);
9094
parent::getReadFile()->setFilePointer($fp);

omim/omim.php

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ function Run()
4747
if(parent::getParameterValue('omim_api_key') == '') {
4848
$key_file = parent::getParameterValue('omim_api_key_file');
4949
if(file_exists($key_file)) {
50-
$key = file_get_contents($key_file);
50+
$key = trim(file_get_contents($key_file));
5151
if($key) {
5252
parent::setParameterValue('omim_api_key', $key);
5353
} else {
@@ -199,8 +199,9 @@ function getListOfEntries($ldir)
199199
}
200200

201201
// download
202+
ftp_pasv($ftp, true);
202203
echo "Downloading $file ...";
203-
if(ftp_get($ftp, $ldir.$file, 'omim/'.$file, FTP_BINARY) === FALSE) {
204+
if(ftp_get($ftp, $ldir.$file, 'OMIM/'.$file, FTP_BINARY) === FALSE) {
204205
trigger_error("Error in downloading $file");
205206
continue;
206207
}

pharmgkb/pharmgkb.php

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -398,9 +398,16 @@ function MapXrefs($xref, &$url = false, &$ns = null, &$id = null)
398398
"refseqrna" => "refseq",
399399
"ucscgenomebrowser" => "refseq",
400400
"uniprotkb" => "uniprot",
401-
'genecard'=>'genecards'
401+
'genecard'=>'genecards',
402+
'ucsc genome browser' => 'refseq',
403+
'refseq rna' => 'refseq',
404+
'refseq protein' => 'refseq',
405+
'refseq dna' => 'refseq',
406+
'comparative toxicogenomics database' => 'ctd',
407+
'humancyc gene' => 'humancyc'
402408
);
403409
$this->getRegistry()->ParseQName($xref,$ns,$id);
410+
$ns = str_replace('"','',$ns);
404411
if(isset($xrefs[$ns])) {
405412
$ns = $xrefs[$ns];
406413
}
@@ -499,7 +506,9 @@ function drugs()
499506
$b = explode(',',trim($a[6]));
500507
foreach($b as $c) {
501508
$this->getRegistry()->parseQName($c,$ns,$id1);
502-
$ns = str_replace(array('keggcompound','keggdrug','drugbank','uniprotkb'), array('kegg','kegg','drugbank', 'uniprot'), strtolower($ns));
509+
$ns = str_replace(array('keggcompound','keggdrug','drugbank','uniprotkb','clinicaltrials.gov','drugs product database (dpd)','national drug code directory','therapeutic targets database','fda drug label at dailymed'),
510+
array('kegg','kegg','drugbank', 'uniprot','clinicaltrials','dpd','ndc','ttd','dailymed'),
511+
strtolower(str_replace('"','',$ns)));
503512
if($ns == "url") {
504513
parent::addRDF(
505514
parent::QQuadO_URL($id, "rdfs:seeAlso", $id)
@@ -728,6 +737,10 @@ function rsid()
728737
$z = 0;
729738
$this->GetReadFile()->Read();
730739
$this->GetReadFile()->Read();
740+
parent::addRDF(
741+
parent::describeClass(parent::getVoc()."Variation", "PharmGKB Variation")
742+
);
743+
731744
while($l = $this->GetReadFile()->Read()) {
732745
if($z % 10000 == 0) {
733746
parent::writeRDFBufferToWriteFile();
@@ -736,10 +749,8 @@ function rsid()
736749
$rsid = "dbsnp:".$a[0];
737750
$genes = explode(";",$a[1]);
738751
parent::addRDF(
739-
parent::describeIndividual($rsid, $rsid, parent::getVoc()."Variation").
740-
parent::describeClass(parent::getVoc()."Variation", "PharmGKB Variation")
752+
parent::describeIndividual($rsid, $rsid, parent::getVoc()."Variation")
741753
);
742-
$this->AddRDF($this->QQuad($rsid,"void:inDataset",$this->GetDatasetURI()));
743754
foreach($genes AS $gene) {
744755
parent::addRDF(
745756
parent::triplify($rsid, parent::getVoc()."gene", parent::getNamespace().$gene)

sgd/sgd.php

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ function __construct($argv) {
3838
parent::__construct($argv,"sgd");
3939
parent::addParameter('files',true,'all|dbxref|features|domains|protein|goa|goslim|complex|interaction|phenotype|pathways|mapping','all','all or comma-separated list of files to process');
4040
parent::addParameter('download_url',false,null,'http://downloads.yeastgenome.org/');
41-
parent::addParameter('ncbo_download_dir', false, null, '/data/download/ncbo/', 'directory of ncbo ontologies');
41+
parent::addParameter('ncbo_download_dir', false, null, '/data/download/bioportal/', 'directory of bioportal ontologies');
4242
parent::addParameter('ncbo_api_key',true,null,null,'your NCBO API key');
4343
parent::addParameter('one_file',false,'true|false','true',"whether to produce a single file output");
4444
parent::initialize();
@@ -639,7 +639,11 @@ function domains(){
639639
"SignalP_GRAM_POSITIVE" => "signalp",
640640
"SignalP_GRAM_NEGATIVE" => "signalp",
641641
"SignalP_EUK" => "signalp",
642-
"TMHMM" => "tmhmm"
642+
"TMHMM" => "tmhmm",
643+
"ProDom" => "prodom",
644+
"ProSiteProfiles" => "prosite",
645+
"ProSitePatterns" => "prosite",
646+
"Hamap" => "hamap"
643647
);
644648

645649
while($l = $this->GetReadFile()->Read(2048)) {

statistics/bio2rdf-individual-page.php

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,7 @@
8484
while($l = fgets($fp)) {
8585
if(!$l or $l[0] == "#" ) continue;
8686
$a = explode("\t",$l);
87+
if(!isset($a[2])) continue;
8788
if($dataset == 'all' or $dataset == trim($a[2])) $list[] = trim($a[2]);
8889
}
8990
fclose($fp);
@@ -95,7 +96,7 @@
9596
$endpoint = getEndpointInfo($dataset);
9697
$options['port'] = $endpoint['isql'];
9798

98-
$options['sparql'] = $entry['sparql'] = "http://localhost:".$endpoint['sparql']."/sparql";
99+
$options['sparql'] = $entry['sparql'] = "http://s2.semanticscience.org:".$endpoint['sparql']."/sparql";
99100

100101
$entry['target.endpoint'] = $entry['sparql'];
101102
if($options['target.endpoint']) $entry['target.endpoint'] = $options['target.endpoint'];
@@ -107,6 +108,7 @@
107108
$entry['from'] = "FROM <".$entry['graph'].">";
108109
$entry['describe'] = '';
109110
$outfile = $options['odir'].$dataset."/$dataset.html";
111+
$outfile = $options['odir']."$dataset.html";
110112
makeHTML($entry,$outfile);
111113
echo "done.".PHP_EOL;
112114
}

statistics/endpoint-statistics.php

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@
2929
*/
3030

3131
$fnx = array(
32+
// "typePropertyTypeCount"
3233
"triples",
3334
"distinctEntities",
3435
"distinctSubjects",
@@ -744,6 +745,15 @@ function addSubjectPropertyObjectCount()
744745
function addTypePropertyTypeCount()
745746
{
746747
global $options;
748+
$sparql = "SELECT ?p
749+
".$options['from-graph']."
750+
{ ?s ?p ?o FILTER (!isLiteral(?o)) }
751+
GROUP BY ?p
752+
";
753+
$r = query($sparql);
754+
foreach($r AS $c) {
755+
$p = $c->p->value;
756+
747757
$sparql = "SELECT
748758
distinct ?stype (str(?stype_label) AS ?stype_label) (?sn AS ?sn) (?dsn AS ?dsn)
749759
?p (str(?plabel) AS ?plabel)
@@ -756,6 +766,7 @@ function addTypePropertyTypeCount()
756766
?s ?p ?o .
757767
?s a ?stype .
758768
?o a ?otype .
769+
FILTER(?p = <$p>)
759770
}
760771
GROUP BY ?p ?stype ?otype
761772
}
@@ -798,6 +809,7 @@ function addTypePropertyTypeCount()
798809
Quad("http://bio2rdf.org/bio2rdf.dataset_vocabulary:Dataset-Object-Count", "http://www.w3.org/2000/01/rdf-schema#subClassOf", "http://bio2rdf.org/bio2rdf.dataset_vocabulary:Dataset-Descriptor")
799810
);
800811
}
812+
} // foreach property
801813
}
802814

803815
function addDatasetPropertyDatasetCount()

taxonomy/taxonomy.php

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ class TaxonomyParser extends Bio2RDFizer{
4848
),
4949
"file_url" => "ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdmp.zip"
5050
),
51-
"gi2taxid_protein" => array(
51+
/* "gi2taxid_protein" => array(
5252
"filename" => "gi_taxid_prot.zip",
5353
"contents" => array(
5454
"gi_taxid_prot" => "gi_taxid_prot.dmp",
@@ -62,7 +62,7 @@ class TaxonomyParser extends Bio2RDFizer{
6262
),
6363
"file_url" => "ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip"
6464
)
65-
);
65+
*/ );
6666

6767
function __construct($argv) {
6868
parent::__construct($argv, "taxonomy");
@@ -325,13 +325,20 @@ private function citations()
325325
continue;
326326
}
327327
$c = parent::getRes()."citation-id-".$a[0];
328+
$seealso = isset($a[4])?trim($a[4]):"";
329+
if($seealso) {
330+
$seealso = str_replace(array("lx: DOI ","http;//"), array("http://dx.doi.org/","http://"), $seealso);
331+
if(strlen($seealso) > 2 and !strstr($seealso,"http")) $seealso = "http://".$seealso;
332+
$seelalso = parent::triplify($c, "rdfs:seeAlso", $seealso);
333+
}
334+
328335
parent::addRDF(
329336
parent::describeIndividual($c, $a[1], $this->getVoc()."Citation").
330337
parent::describeClass($this->getVoc()."Citation", "Citation").
331338
parent::triplifyString($c, parent::getVoc()."citation-key", $a[1]).
332339
($a[2]=="0"?"":parent::triplify($c, parent::getVoc()."x-pubmed", "pubmed:".$a[2])).
333-
(!isset($a[4])?"":parent::triplify($c, "rdfs:seeAlso", str_replace("lx: DOI ","http://dx.doi.org/", $a[4]))).
334-
(!isset($a[5])?"":parent::triplifyString($c, parent::getVoc()."text", str_replace("\"","", $a[5])))
340+
$seealso.
341+
((isset($a[5]) and $a[5])?parent::triplifyString($c, parent::getVoc()."text", str_replace("\"","", $a[5])):"")
335342
);
336343
if(isset($a[6])) {
337344
$taxids = explode(" ", trim($a[6]));

0 commit comments

Comments
 (0)