[ Home Page ] [ I/O Scripts Page ]
Source Code Listing of Still Other *.inc.php Files
(Parser for Still Other file formats)
Note: This is part of the BioPHP 1.1 alpha code set. The code, which is more than 1,500 lines long, is still rough. It also depends on the alpha versions of two other files, "seqdb.inc.php" and "etc.inc.php", which I will post shortly. Improvements to the code are welcome!
=====================================================
PMD.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seqdb.inc.php");
require_once("seq.inc.php");
/**
 * Plain data holder for one PMD record.
 *
 * parse_protein_pmd() fills entry_type, entry_no, mutation_type, article_no,
 * authors, medline_no, journal and title. The other properties are declared
 * for later use and are not populated by that parser.
 */
class Protein_PMD
{
    public $entry_type;     // first character of the ENTRY code
    public $entry_no;       // the six characters that follow the entry type
    public $mutation_type;  // first word after the ENTRY code, e.g. "Artificial"
    public $article_no;     // second word after the ENTRY code
    public $authors;        // array of author names from the AUTHORS field
    public $journal;        // JOURNAL lines joined with single spaces
    public $medline_no;     // data of the MEDLINE line
    public $title;          // TITLE lines joined with single spaces
    public $dbref;          // structure unclear, implement later.
    public $protein;
    public $sequence;
    public $source;
    public $n_terminal;
    public $express_sys;
    public $change;         // structure unclear, implement later.
    public $disease;        // structure unclear, implement later.
    public $comment;
}
/**
 * Parses the lines of one PMD entry into a Protein_PMD object.
 *
 * The first 16 characters of a line are its label and the rest is its data.
 * A line with a blank label continues the multi-line field (AUTHORS, JOURNAL
 * or TITLE) opened by the most recent labelled line. Parsing stops at a "///"
 * line or at the end of the input, whichever comes first.
 *
 * ENTRY example:  ENTRY  A000300 - Artificial  2607383
 * The entry type and number are read from fixed positions in the ENTRY data;
 * the mutation type and article number are the first two words found from
 * position 10 onwards.
 *
 * @param array $flines Lines of the entry, in file order.
 * @return Protein_PMD Object with entry_type, entry_no, mutation_type,
 *                     article_no, authors, medline_no, journal and title set.
 *                     Fields missing from the input stay NULL (authors: an
 *                     empty array).
 */
function parse_protein_pmd($flines)
{
    $entry_type = NULL;
    $entry_no = NULL;
    $mutation_type = NULL;
    $article_no = NULL;
    $medline_no = NULL;
    $journal = NULL;
    $title = NULL;
    $aAuthors = array();

    $open_field = "";   // label of the multi-line field currently being collected
    $buffer = "";       // its text so far, lines joined with a single space

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    // The extra "///" line guarantees that a field which is still open when the
    // input ends without a terminator is stored like any other.
    $lines = is_array($flines) ? array_values($flines) : array();
    $lines[] = "///";

    foreach ($lines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = trim((string) substr($linestr, 0, 16));
        $linedata = trim((string) substr($linestr, 16));

        if ($linelabel === "") {
            // Continuation line: only meaningful while a multi-line field is open.
            if ($open_field !== "") $buffer .= $linedata . " ";
            continue;
        }

        // Any labelled line closes the multi-line field that was open.
        switch ($open_field) {
            case "AUTHORS":
                // Names are separated by "," or "&"; blank pieces are dropped.
                $names = array_map("trim", preg_split("/[\,\&]/", $buffer));
                $aAuthors = array_values(array_filter($names, "strlen"));
                break;
            case "JOURNAL":
                $journal = trim($buffer);
                break;
            case "TITLE":
                $title = trim($buffer);
                break;
        }
        $open_field = "";
        $buffer = "";

        switch ($linelabel) {
            case "///":
                break 2;    // end of entry
            case "ENTRY":
                $entry_type = (string) substr($linedata, 0, 1);
                $entry_no = (string) substr($linedata, 1, 6);
                $entry_tokens = preg_split("/\s+/", (string) substr($linedata, 10), -1, PREG_SPLIT_NO_EMPTY);
                $mutation_type = isset($entry_tokens[0]) ? $entry_tokens[0] : "";
                $article_no = isset($entry_tokens[1]) ? $entry_tokens[1] : "";
                break;
            case "MEDLINE":
                // For now, assume that it is always exactly one word.
                $medline_no = $linedata;
                break;
            case "AUTHORS":
            case "JOURNAL":
            case "TITLE":
                $open_field = $linelabel;
                $buffer = $linedata . " ";
                break;
        }
    }

    $oProtein = new Protein_PMD();
    $oProtein->entry_type = $entry_type;
    $oProtein->entry_no = $entry_no;
    $oProtein->mutation_type = $mutation_type;
    $oProtein->article_no = $article_no;
    $oProtein->authors = $aAuthors;
    $oProtein->medline_no = $medline_no;
    $oProtein->journal = $journal;
    $oProtein->title = $title;
    return $oProtein;
}
?>
=====================================================
PRF.INC.PHP - SOURCE CODE
<?php
// prf.inc.php
require_once("etc.inc.php");
require_once("seqdb.inc.php");
require_once("seq.inc.php");
/**
 * Plain data holder for one PRF record.
 *
 * parse_protein_prf() fills entry_code, entry_name, journal, authors, title
 * and comment. It does not yet populate source, keywords, dbref or sequence.
 */
class Protein_PRF
{
    public $entry_code;  // data of the CODE line
    public $entry_name;  // data of the NAME line
    public $source;
    public $journal;     // JOURNAL lines joined with single spaces
    public $authors;     // array of author names from the AUTHOR field
    public $title;       // TITLE lines joined with single spaces
    public $keywords;
    public $comment;     // COMMENT lines joined with single spaces
    public $dbref;
    public $sequence;
}
/**
 * Parses the lines of one PRF entry into a Protein_PRF object.
 *
 * The first 12 characters of a line are its label and the rest is its data.
 * A line with a blank label continues the multi-line field (JOURNAL, AUTHOR,
 * TITLE or COMMENT) opened by the most recent labelled line; continuation
 * lines of any other field are ignored. Parsing stops at a "///" line or at
 * the end of the input.
 *
 * Handled fields:
 *   CODE     one word, e.g. "0904306A"
 *   NAME     first line only (sub-keys such as "determine" are not supported)
 *   JOURNAL  lines joined with spaces, no further splitting
 *   AUTHOR   e.g. "Taniguchi,T., Matsui,H., Hamuro,J." -> one element per author
 *   TITLE    lines joined with spaces
 *   COMMENT  lines joined with spaces
 * SOURCE, KEYWORD, CROSSREF and SEQUENCE are recognised by the format but are
 * not extracted yet.
 *
 * @param array $flines Lines of the entry, in file order.
 * @return Protein_PRF Object with entry_code, entry_name, journal, authors,
 *                     title and comment set. Fields missing from the input
 *                     stay NULL (authors: an empty array).
 */
function parse_protein_prf($flines)
{
    $entry_code = NULL;
    $entry_name = NULL;
    $journal = NULL;
    $title = NULL;
    $comment = NULL;
    $aAuthors = array();

    $open_field = "";   // label of the multi-line field currently being collected
    $buffer = "";       // its text so far, lines joined with a single space

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    // The extra "///" line makes sure a field that is still open at the end of
    // an unterminated input is stored as well.
    $lines = is_array($flines) ? array_values($flines) : array();
    $lines[] = "///";

    foreach ($lines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = trim((string) substr($linestr, 0, 12));
        $linedata = trim((string) substr($linestr, 12));

        if ($linelabel === "") {
            if ($open_field !== "") $buffer .= $linedata . " ";
            continue;
        }

        // Any labelled line closes the multi-line field that was open.
        switch ($open_field) {
            case "JOURNAL":
                $journal = trim($buffer);
                break;
            case "TITLE":
                $title = trim($buffer);
                break;
            case "COMMENT":
                $comment = trim($buffer);
                break;
            case "AUTHOR":
                // Splitting on ".," removes the final period of every name but
                // the last one, so it is put back on those names.
                $names = array_map("trim", preg_split("/\.\,/", $buffer, -1, PREG_SPLIT_NO_EMPTY));
                $last_author = array_pop($names);
                foreach ($names as $author) {
                    $aAuthors[] = "$author.";
                }
                if ($last_author !== NULL && $last_author !== "") {
                    $aAuthors[] = $last_author;
                }
                break;
        }
        $open_field = "";
        $buffer = "";

        switch ($linelabel) {
            case "///":
                break 2;    // end of entry
            case "CODE":
                $entry_code = $linedata;
                break;
            case "NAME":
                $entry_name = $linedata;
                break;
            case "JOURNAL":
            case "AUTHOR":
            case "TITLE":
            case "COMMENT":
                $open_field = $linelabel;
                $buffer = $linedata . " ";
                break;
            // SOURCE, KEYWORD, CROSSREF, SEQUENCE: not extracted yet.
        }
    }

    $oProtein = new Protein_PRF();
    $oProtein->entry_code = $entry_code;
    $oProtein->entry_name = $entry_name;
    $oProtein->journal = $journal;
    $oProtein->authors = $aAuthors;
    $oProtein->title = $title;
    $oProtein->comment = $comment;
    return $oProtein;
}
/*
CODE 0904306A
NAME interleukin 2
determine protein
SOURCE Homo sapiens
cname man
taxon Eucarya;Animalia;Metazoa;Chordata;Vertebrata;Gnathostomata;
Mammalia;Eutheria;Primates;Catarrhini;Hominidae
JOURNAL Nature(London), 302(5906),305-310(1983)
AUTHOR Taniguchi,T., Matsui,H., Fujita,T., Takaoka,C., Kashima,N.,
Yoshimoto,R., Hamuro,J.
TITLE Structure and expression of a cloned cDNA for human interleukin-2.
KEYWORD Interleukin 2 Human Cloning From cDNA Library
Seq Determination 812bp mRNA Hybridization Translation
Expression in Monkey Cell 153AAs T Cell Growth Factor
Stimulation of Thymidine Uptake
CROSSREF PIR=ICHU2;PIR=ICGI2
SEQUENCE
MYRMQLLSCI ALSLALVTNS APTSSSTKKT QLQLEHLLLD LQMILNGINN YKNPKLTRML
TFKFYMPKKA TELKHLQCLE EELKPLEEVL NLAQSKNFHL RPRDLISNIN VIVLELKGSE
TTFMCEYADE TATIVEFLNR WITFCQSIIS TLT
*/
?>
=====================================================
PRINTS.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");
require_once("seqdb.inc.php");
/**
 * Plain data holder for one PRINTS motif record, filled by parse_motif_prints().
 */
class PrintsMotif
{
    public $entry_name;   // data of the "gc;" line
    public $entry_type;   // data of the "gn;" line
    public $create_date;  // first item of the "ga;" line
    public $upd_date;     // value of the UPDATE item of the "ga;" line
    public $desc;         // "gd;" lines joined with single spaces
}
/**
 * Parses the lines of one PRINTS record into a PrintsMotif object.
 *
 * The first three characters of a line are its label; the data starts at
 * position 4. Every line of the input is read (there is no terminator line).
 *
 *   gc;  entry name, one line
 *   gn;  entry type, one line
 *   ga;  creation date followed by ";"-separated "KEY value" items, e.g.
 *        "ga; 16-NOV-1995; UPDATE 06-JUN-1999"
 *   gd;  description, usually several lines, joined with single spaces
 *
 * @param array $flines Lines of the record, in file order.
 * @return PrintsMotif Fields missing from the input stay NULL.
 */
function parse_motif_prints($flines)
{
    $entry_name = NULL;
    $entry_type = NULL;
    $create_date = NULL;
    $upd_date = NULL;
    $desc = NULL;

    $desc_flag = FALSE;     // TRUE while consecutive "gd;" lines are being collected
    $desc_string = "";

    if (!is_array($flines)) $flines = array();

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    foreach ($flines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = (string) substr($linestr, 0, 3);
        $linedata = trim((string) substr($linestr, 4));

        if ($linelabel == "gd;") {
            $desc_string .= $linedata . " ";
            $desc_flag = TRUE;
            continue;
        }

        // The first non-"gd;" line after a description block closes it.
        if ($desc_flag) {
            $desc = trim($desc_string);
            $desc_string = "";
            $desc_flag = FALSE;
        }

        if ($linelabel == "gc;") {
            $entry_name = $linedata;
        } elseif ($linelabel == "gn;") {
            $entry_type = $linedata;
        } elseif ($linelabel == "ga;") {
            $date_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
            if (count($date_tokens) > 0) {
                $create_date = trim(array_shift($date_tokens));
            }
            foreach ($date_tokens as $keyval) {
                // Each remaining item is "KEY value ..."; only UPDATE is used.
                $keyval_tokens = preg_split("/\s+/", $keyval, -1, PREG_SPLIT_NO_EMPTY);
                if (count($keyval_tokens) == 0) continue;
                $key = array_shift($keyval_tokens);
                if ($key == "UPDATE") $upd_date = implode(" ", $keyval_tokens);
            }
        }
    }

    // A description that runs to the very last line is stored here.
    if ($desc_flag) $desc = trim($desc_string);

    $oPrintsMotif = new PrintsMotif();
    $oPrintsMotif->entry_name = $entry_name;
    $oPrintsMotif->entry_type = $entry_type;
    $oPrintsMotif->desc = $desc;
    $oPrintsMotif->create_date = $create_date;
    $oPrintsMotif->upd_date = $upd_date;
    return $oPrintsMotif;
}
?>
=====================================================
PRODOM.INC.PHP - SOURCE CODE
<?php
// prodom.inc.php
/**
 * Plain data holder for one ProDom family record, filled by parse_protfam_prodom().
 */
class ProtFam_Prodom
{
    public $entry_no;      // first word of the ID line
    public $accession;     // data of the AC line
    public $release;       // second word of the ID line without its leading character
    public $domain_count;  // third word of the ID line, as an integer
    public $freq_names;    // an associative array e.g. array("FDAM" => 2, ...)
    public $keywords;      // a simple 1D array e.g. array("DNA-BINDING", "PROTEASE", ...)
}
/**
 * Parses the lines of one ProDom family entry into a ProtFam_Prodom object.
 *
 * The first two characters of a line are its label. Parsing stops at a "//"
 * line or at the end of the input.
 *
 *   ID  "ID 20167 p2002.1 10 seq." -> entry_no "20167", release "2002.1"
 *       (leading character dropped), domain_count 10. Data starts at position 3.
 *   AC  accession, one word. Data starts at position 5.
 *   KW  "[NAME(COUNT) ...] // KEYWORD [KEYWORD ...]", e.g.
 *       "KW FADR(2) Y586(1) // COMPLETE PROTEOME DNA-BINDING"
 *       -> freq_names array("FADR" => 2, "Y586" => 1),
 *          keywords array("COMPLETE", "PROTEOME", "DNA-BINDING").
 *
 * @param array $flines Lines of the entry, in file order.
 * @return ProtFam_Prodom Scalar fields missing from the input stay NULL;
 *                        freq_names and keywords default to empty arrays.
 */
function parse_protfam_prodom($flines)
{
    $entry_no = NULL;
    $accession = NULL;
    $release = NULL;
    $domain_count = NULL;
    $aFreqNames = array();
    $aKeywords = array();

    if (!is_array($flines)) $flines = array();

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    foreach ($flines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel == "//") break;

        if ($linelabel == "ID") {
            // The ID data starts at position 3 instead of 5.
            $id_tokens = preg_split("/\s+/", trim((string) substr($linestr, 3)), -1, PREG_SPLIT_NO_EMPTY);
            if (isset($id_tokens[0])) $entry_no = $id_tokens[0];
            // Drop the prefix character ("p") of the second word to get the release.
            if (isset($id_tokens[1])) $release = (string) substr($id_tokens[1], 1);
            // The fourth word (assumed to always be "seq.") is ignored.
            if (isset($id_tokens[2])) $domain_count = (int) $id_tokens[2];
        } elseif ($linelabel == "AC") {
            $accession = trim((string) substr($linestr, 5));
        } elseif ($linelabel == "KW") {
            $linedata = trim((string) substr($linestr, 5));

            // Split once on "//" and keep the (possibly empty) left part, so a
            // line without frequent names does not turn its keywords into names.
            $kw_parts = explode("//", $linedata, 2);

            $aFreqNames = array();
            $freqname_tokens = preg_split("/\s+/", trim($kw_parts[0]), -1, PREG_SPLIT_NO_EMPTY);
            foreach ($freqname_tokens as $item) {
                // "FADR(2)" -> name "FADR", frequency 2
                $paren = strpos($item, "(");
                if ($paren === FALSE) {
                    $aFreqNames[$item] = 0;
                } else {
                    $aFreqNames[substr($item, 0, $paren)] = (int) substr($item, $paren + 1);
                }
            }

            $aKeywords = isset($kw_parts[1])
                ? preg_split("/\s+/", trim($kw_parts[1]), -1, PREG_SPLIT_NO_EMPTY)
                : array();
        }
    }

    $oProtFam = new ProtFam_Prodom();
    $oProtFam->entry_no = $entry_no;
    $oProtFam->accession = $accession;
    $oProtFam->release = $release;
    $oProtFam->domain_count = $domain_count;
    $oProtFam->freq_names = $aFreqNames;
    $oProtFam->keywords = $aKeywords;
    return $oProtFam;
}
/*
ID 20167 p2002.1 10 seq.
AC PD266930
KW FADR(2) Y586(1) // COMPLETE PROTEOME DNA-BINDING FATTY TRANSCRIPTION REGULATION METABOLISM REGULATOR ACID ACTIVATOR
LA 74
ND 10
CC -!- DIAMETER: 119 PAM
CC -!- RADIUS OF GYRATION: 53 PAM
CC -!- SEQUENCE CLOSEST TO CONSENSUS: Q8ZEL9_YERPE 5-78 (distance:15 PAM)
DC This family was generated by psi-blast, with a profile built from the seed aligment of the following SCOP FAMILY
DC a.4.5.6
AL P09371|FADR_ECOLI 4 77 0.22 AQSPAGFAEEYIIESIWNNRFPPGTILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS
AL Q8ZP15|Q8ZP15_SALTY 5 78 0.22 AQSPAGFAEEYIIESIWNNRFPPGTILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS
AL Q8ZEL9|Q8ZEL9_YERPE 5 78 0.22 AQSPAGFAEEYIIESIWNNRFPPGSILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS
AL Q8Z685|Q8Z685_SALTI 5 78 0.35 AQSPAGFAEEYIIESIWNNCFPPGTILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNFWETS
AL Q9KQU8|Q9KQU8_VIBCH 5 78 0.62 AKSPAGFAEKYIIESIWNGRFPPGSILPAERELSELIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNQFMETS
AL Q9CPJ0|Q9CPJ0_PASMU 10 83 0.77 AQSPAGLAEEYIVRSIWNNHFPPGSDLPAERELAEKIGVTRTTLREVLQRLARDGWLNIQHGKPTKVNNIWETS
AL P44705|FADR_HAEIN 10 81 1.08 AQSPAALAEEYIVKSIWQDVFPAGSNLPSERDLADKIGVTRTTLREVLQRLARDGWLTIQHGKPTKVNNIWD..
AL O07792|Y586_MYCTU 17 77 2.08 .........EQIATDVLTGEMPPGEALPSERRLAELLGVSRPAVREALKRLSAAGLVEVRQGDVTTVRDF....
AL Q11159|Y494_MYCTU 27 77 2.21 ...........IADAILDGVFPPGSTLPPERDLAERLGVNRTSLRQGLARLQQMGLIEVRHG............
AL Q8XFI2|Q8XFI2_SALTY 59 109 2.23 ...........IIKLINDNIFPPGTFLPPERELAKQLGVSRASLREALIVLEISGWIVIQSG............
CO AQSPAGFAEEYIVKSIWDGVFPPGSTLPPERELAERLGVSRTSLREALQRLERDGWIEIQHGKPTKVNNFWETS
DR INTERPRO; IPR000524 "Bacterial regulatory proteins, GntR"
DR PfamA; PF00392 gntR
DR PROSITE; PS00043 PDOC00042 HTH_GNTR_FAMILY (27-51)
DR PDB; 1H9T chain B (5-78) Q8ZP15_SALTY (5-78),1HW1 chain A (5-78),1HW1 chain B (5-78)
DR PDB; 1H9T chain A (5-78) Q8ZP15_SALTY (5-78)
//
*/
?>
=====================================================
REFSEQ.INC.PHP - SOURCE CODE
<?php
// refseq.inc.php
/**
 * SeqAlign() is the constructor method for the SeqAlign class. It initializes
 * the class properties seq_count, length, seqptr, gap_count, is_flush and
 * seqset, either to an empty alignment or from an alignment file.
 *
 * NOTE: the body uses $this, so it only works as a method of the SeqAlign class.
 *
 * FASTA: every ">" header starts a new sequence. The header is expected to
 * look like ">id/start-end"; when the "/start-end" part is missing, start and
 * end are NULL. length is the longest sequence length, gap_count the total
 * number of "-" symbols (as reported by seq::symfreq), and is_flush is TRUE
 * when all sequences have the same length.
 *
 * CLUSTAL: the first line and blank lines are skipped. Columns 0-15 hold the
 * sequence name and the next 60 columns the residues; a non-blank line with an
 * empty name is treated as the conservation line. length is the accumulated
 * length of the conservation lines and is_flush is always TRUE.
 *
 * @param string $filename Alignment file to read; "" creates an empty alignment.
 * @param string $format   "FASTA" or "CLUSTAL". Any other value leaves the
 *                         properties untouched.
 */
function SeqAlign($filename = "", $format = "FASTA")
{
    if (strlen($filename) == 0) {
        $this->seq_count = 0;
        $this->length = 0;
        $this->seqptr = 0;
        $this->gap_count = 0;
        $this->is_flush = TRUE;
        $this->seqset = array();
        return;
    }

    if ($format == "FASTA") {
        $flines = file($filename);
        if ($flines === FALSE) $flines = array();
        // A trailing NULL marks the end of the file, so the last sequence is
        // stored by exactly the same code as all the others.
        $flines[] = NULL;

        $this->seqset = array();
        $seqctr = 0;
        $maxlen = 0;
        $gapctr = 0;
        $samelength = TRUE;
        $prev_len = NULL;       // length of the sequence stored just before
        $in_record = FALSE;     // TRUE once the first header has been seen
        $seqstr = "";
        $cur_id = NULL;
        $cur_start = NULL;
        $cur_end = NULL;

        // foreach replaces the old each() loop (each() does not exist in PHP 8).
        foreach ($flines as $linestr) {
            $at_end = ($linestr === NULL);
            $is_header = (!$at_end) && (substr($linestr, 0, 1) == ">");

            if (!$at_end && !$is_header) {
                // Residue line; anything before the first header is ignored.
                if ($in_record) $seqstr .= trim($linestr);
                continue;
            }

            // A header or the end of the file completes the sequence in progress.
            if ($in_record) {
                $seqlen = strlen($seqstr);
                $seq_obj = new seq();
                $seq_obj->id = $cur_id;
                $seq_obj->length = $seqlen;
                $seq_obj->sequence = $seqstr;
                $seq_obj->start = $cur_start;
                $seq_obj->end = $cur_end;
                $gapctr += $seq_obj->symfreq("-");
                if ($seqlen > $maxlen) $maxlen = $seqlen;
                if ($prev_len !== NULL && $seqlen != $prev_len) $samelength = FALSE;
                $prev_len = $seqlen;
                array_push($this->seqset, $seq_obj);
            }

            if ($is_header) {
                $seqctr++;
                $in_record = TRUE;
                $seqstr = "";
                // ">id/start-end" -> id, start, end (line ending stripped first)
                $words = preg_split("/[\>\/]/", trim(substr($linestr, 1)));
                $cur_id = $words[0];
                $indexes = isset($words[1]) ? preg_split("/-/", $words[1]) : array();
                $cur_start = isset($indexes[0]) ? $indexes[0] : NULL;
                $cur_end = isset($indexes[1]) ? $indexes[1] : NULL;
            }
        }

        $this->seq_count = $seqctr;
        $this->length = $maxlen;
        $this->seqptr = 0;
        $this->gap_count = $gapctr;
        $this->is_flush = $samelength;
    } elseif ($format == "CLUSTAL") {
        $flines = file($filename);
        if ($flines === FALSE) $flines = array();

        $namelist = array();
        $seq = array();
        $conserve_line = "";
        $lastlen = 0;       // residue count of the most recent sequence line
        $linectr = 0;

        foreach ($flines as $linestr) {
            $linectr++;
            if ($linectr == 1) continue;                    // skip the first line.
            if (strlen(trim($linestr)) == 0) continue;      // ignore blank lines.

            $seqname = trim((string) substr($linestr, 0, 16));
            $seqline = (string) substr($linestr, 16, 60);

            if ($seqname === "") {
                // Conservation line: keep only as many columns as the block is wide.
                $conserve_line .= substr($seqline, 0, $lastlen);
                continue;
            }

            // Trimmed on first sight too, so no line ending ends up in the sequence.
            $residues = trim($seqline);
            if (!in_array($seqname, $namelist, TRUE)) {
                $namelist[] = $seqname;
                $seq[$seqname] = $residues;
            } else {
                $seq[$seqname] .= $residues;
            }
            $lastlen = strlen($residues);
        }

        $this->seqset = array();
        $gapctr = 0;
        foreach ($seq as $key => $value) {
            $seq_obj = new seq();
            $seq_obj->id = $key;
            $seq_obj->length = strlen($value);
            $seq_obj->sequence = $value;
            $seq_obj->start = 0;
            $seq_obj->end = $seq_obj->length - 1;
            $gapctr += $seq_obj->symfreq("-");
            array_push($this->seqset, $seq_obj);
        }

        $this->seq_count = count($namelist);
        $this->length = strlen($conserve_line);
        $this->seqptr = 0;
        $this->gap_count = $gapctr;
        $this->is_flush = TRUE;
    }
}
?>
=====================================================
TRANSFAC.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");
/**
 * Plain data holder for one MATRIX.DAT (Transfac) record.
 *
 * parse_tfmatrix_transfac() fills accession, id, date_created, date_updated,
 * desc and comments; the other properties are not populated by that parser.
 */
class TFMatrix
{
    public $accession;      // data of the AC line
    public $id;             // data of the ID line
    public $date_created;   // date of the "(created)" DT line
    public $date_updated;   // date of the "(updated)" DT line
    public $bnd_factor;
    public $desc;           // data of the DE line
    public $linked_factors;
    public $matrix;
    public $stat_basis;
    public $comments;       // CC lines joined with single spaces
    public $ref_no;
    public $ref_author;
    public $ref_title;
    public $ref_data;
}
/**
 * parse_tfmatrix_transfac() parses one MATRIX.DAT (Transfac) record and
 * returns a TFMatrix object containing the parsed data.
 *
 * The first two characters of a line are its label; the data starts at
 * position 4. Parsing stops at a "//" line or at the end of the input.
 *
 *   ID  identification, one string in one line
 *   AC  accession number, one string in one line
 *   DT  "20.06.90 (created); ewi." / "24.08.95 (updated); hiwi."
 *       Everything before "(created)" or "(updated)" is taken as the date, so
 *       an optional time after the date is kept as well.
 *   DE  description, one line
 *   CC  comments, one or more lines joined with single spaces
 *
 * @param array $flines Lines of the record, in file order.
 * @return TFMatrix Fields missing from the input stay NULL.
 */
function parse_tfmatrix_transfac($flines)
{
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $desc = NULL;
    $cc_seen = FALSE;
    $cc_string = "";

    if (!is_array($flines)) $flines = array();

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    foreach ($flines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel == "//") break;
        $linedata = trim((string) substr($linestr, 4));

        switch ($linelabel) {
            case "ID":
                $id = $linedata;
                break;
            case "AC":
                $accession = $linedata;
                break;
            case "DE":
                $desc = $linedata;
                break;
            case "DT":
                if (preg_match('/^(.*?)\s*\((created|updated)\)/', $linedata, $m)) {
                    if ($m[2] == "created") $date_created = $m[1];
                    else $date_updated = $m[1];
                }
                break;
            case "CC":
                $cc_string .= $linedata . " ";
                $cc_seen = TRUE;
                break;
        }
    }

    // Built after the loop so that comments on the last lines are not lost.
    $comments = $cc_seen ? trim($cc_string) : NULL;

    $oTFMatrix = new TFMatrix();
    $oTFMatrix->accession = $accession;
    $oTFMatrix->id = $id;
    $oTFMatrix->date_created = $date_created;
    $oTFMatrix->date_updated = $date_updated;
    $oTFMatrix->desc = $desc;
    $oTFMatrix->comments = $comments;
    return $oTFMatrix;
}
/**
 * Plain data holder for one GENE.DAT (Transfac) record.
 *
 * parse_tfgene_transfac() fills accession, id, date_created, date_updated,
 * desc_short, desc_long, organism, species, tax_class and compel_accno; the
 * other properties are not populated by that parser.
 */
class TFGene
{
    public $accession;      // data of the AC line
    public $id;             // data of the ID line
    public $date_created;   // date of the "(created)" DT line
    public $date_updated;   // date of the "(updated)" DT line
    public $desc_short;     // data of the SD line
    public $desc_long;      // data of the DE line
    public $organism;       // first item of the OS line (common name)
    public $species;        // second item of the OS line (scientific name)
    public $tax_class;      // array of the ";"-separated items of the OC lines
    public $bucher_class;
    public $tfsite_pos;
    public $tfsite_accno;
    public $compel_accno;   // array with the data of every CO line
    public $trrd_accno;
}
/**
 * parse_tfgene_transfac() parses one GENE.DAT (Transfac) record and returns a
 * TFGene object containing the parsed data.
 *
 * The first two characters of a line are its label; the data starts at
 * position 4. Parsing stops at a "//" line or at the end of the input.
 *
 *   ID  identification, one string in one line
 *   AC  accession number, one string in one line
 *   DT  "20.06.90 (created); ewi." / "24.08.95 (updated); hiwi."
 *       Everything before "(created)" or "(updated)" is taken as the date.
 *   SD  short description, one line
 *   DE  long description / gene name, one line
 *   OS  "common_name, scientific_name", e.g. "human, homo sapiens"
 *       -> organism "human", species "homo sapiens"
 *   OC  "kingdom; phylum; class; ...", possibly over several lines
 *       -> tax_class array("eukaryota", "animalia", ...)
 *   CO  COMPEL accession number, one per line, any number of lines
 *
 * @param array $flines Lines of the record, in file order.
 * @return TFGene Scalar fields missing from the input stay NULL; tax_class and
 *                compel_accno default to empty arrays.
 */
function parse_tfgene_transfac($flines)
{
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $desc_short = NULL;
    $desc_long = NULL;
    $organism = NULL;
    $species = NULL;
    $tax_string = "";
    $aCompel = array();

    if (!is_array($flines)) $flines = array();

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    foreach ($flines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel == "//") break;
        $linedata = trim((string) substr($linestr, 4));

        switch ($linelabel) {
            case "ID":
                $id = $linedata;
                break;
            case "AC":
                $accession = $linedata;
                break;
            case "SD":
                $desc_short = $linedata;
                break;
            case "DE":
                $desc_long = $linedata;
                break;
            case "DT":
                if (preg_match('/^(.*?)\s*\((created|updated)\)/', $linedata, $m)) {
                    if ($m[2] == "created") $date_created = $m[1];
                    else $date_updated = $m[1];
                }
                break;
            case "OS":
                $org_tokens = array_map("trim", preg_split("/,/", $linedata, -1, PREG_SPLIT_NO_EMPTY));
                $organism = isset($org_tokens[0]) ? $org_tokens[0] : NULL;
                // The scientific name is optional.
                $species = isset($org_tokens[1]) ? $org_tokens[1] : NULL;
                break;
            case "OC":
                $tax_string .= $linedata . " ";
                break;
            case "CO":
                $aCompel[] = $linedata;
                break;
        }
    }

    // Split after the loop so that OC lines at the very end are not lost;
    // items that are blank after trimming are dropped.
    $tax_tokens = array_map("trim", preg_split("/;/", $tax_string, -1, PREG_SPLIT_NO_EMPTY));
    $tax_tokens = array_values(array_filter($tax_tokens, "strlen"));

    $oTFGene = new TFGene();
    $oTFGene->accession = $accession;
    $oTFGene->id = $id;
    $oTFGene->date_created = $date_created;
    $oTFGene->date_updated = $date_updated;
    $oTFGene->desc_short = $desc_short;
    $oTFGene->desc_long = $desc_long;
    $oTFGene->organism = $organism;
    $oTFGene->species = $species;
    $oTFGene->tax_class = $tax_tokens;
    $oTFGene->compel_accno = $aCompel;
    return $oTFGene;
}
/**
 * Plain data holder for one CLASS.DAT (Transfac) record.
 *
 * parse_tfclass_transfac() fills accession, id, date_created, date_updated,
 * class and comments; the other properties are not populated by that parser.
 */
class TFClass
{
    public $accession;      // data of the AC line
    public $id;             // data of the ID line
    public $date_created;   // date of the "(created)" DT line
    public $date_updated;   // date of the "(updated)" DT line
    public $class;          // array of the ";"-separated items of the CL lines
    public $struct_desc;
    public $comments;       // CC lines joined with single spaces
    public $member_factors;
    public $ref_no;
    public $ref_author;
    public $ref_title;
    public $ref_data;
    public $dbref;
}
/**
 * parse_tfclass_transfac() parses one CLASS.DAT (Transfac) record and returns
 * a TFClass object containing the parsed data.
 *
 * The first two characters of a line are its label; the data starts at
 * position 4. Parsing stops at a "//" line or at the end of the input.
 *
 *   ID  identification, one string in one line
 *   AC  accession number, one string in one line
 *   DT  "20.06.90 (created); ewi." / "24.08.95 (updated); hiwi."
 *       Everything before "(created)" or "(updated)" is taken as the date.
 *   CL  class names separated by ";", on one or more lines, e.g.
 *       "CL zinc cluster; zinc-cysteine cluster; C6 zinc finger"
 *       -> array("zinc cluster", "zinc-cysteine cluster", "C6 zinc finger")
 *   CC  comments, one or more lines joined with single spaces
 *
 * @param array $flines Lines of the record, in file order.
 * @return TFClass Scalar fields missing from the input stay NULL; class
 *                 defaults to an empty array.
 */
function parse_tfclass_transfac($flines)
{
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $class_string = "";
    // The comment buffer was never initialised in the original code.
    $cc_seen = FALSE;
    $cc_string = "";

    if (!is_array($flines)) $flines = array();

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    foreach ($flines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel == "//") break;
        $linedata = trim((string) substr($linestr, 4));

        switch ($linelabel) {
            case "ID":
                $id = $linedata;
                break;
            case "AC":
                $accession = $linedata;
                break;
            case "DT":
                if (preg_match('/^(.*?)\s*\((created|updated)\)/', $linedata, $m)) {
                    if ($m[2] == "created") $date_created = $m[1];
                    else $date_updated = $m[1];
                }
                break;
            case "CL":
                $class_string .= $linedata . " ";
                break;
            case "CC":
                $cc_string .= $linedata . " ";
                $cc_seen = TRUE;
                break;
        }
    }

    // Both multi-line fields are finished after the loop, so they are kept even
    // when they are the last lines of the input. A trailing ";" no longer
    // produces an empty class name.
    // Later, look into possibility that some elements of $class_tokens array
    // might contain special characters like ', \, /, etc. - Serge
    $class_tokens = array_map("trim", preg_split("/;/", $class_string, -1, PREG_SPLIT_NO_EMPTY));
    $class_tokens = array_values(array_filter($class_tokens, "strlen"));
    $comments = $cc_seen ? trim($cc_string) : NULL;

    $oTFClass = new TFClass();
    $oTFClass->accession = $accession;
    $oTFClass->id = $id;
    $oTFClass->date_created = $date_created;
    $oTFClass->date_updated = $date_updated;
    $oTFClass->class = $class_tokens;
    $oTFClass->comments = $comments;
    return $oTFClass;
}
/**
 * Plain data holder for one CELL.DAT (Transfac) record.
 *
 * parse_cell_transfac() fills accession, id, date_created, date_updated,
 * organism, factor_src and desc; author is not populated by that parser.
 */
class Cell
{
    public $accession;      // data of the AC line
    public $id;             // data of the ID line
    public $date_created;   // date of the "(created)" DT line
    public $date_updated;   // date of the "(updated)" DT line
    public $author;
    public $organism;       // data of the OS line (common name)
    public $factor_src;     // data of the SO line
    public $desc;           // CD lines joined with single spaces
}
/**
 * parse_cell_transfac() parses one CELL.DAT (Transfac) record and returns a
 * Cell object containing the parsed data.
 *
 * The first two characters of a line are its label; the data starts at
 * position 4. Parsing stops at a "//" line or at the end of the input.
 *
 *   ID  identification, one string in one line
 *   AC  accession number, one string in one line
 *   DT  "20.06.90 (created); ewi." / "24.08.95 (updated); hiwi."
 *       Everything before "(created)" or "(updated)" is taken as the date.
 *   OS  organism, common name only, e.g. "human"
 *   SO  factor source, one line
 *   CD  cell description, one or more lines joined with single spaces
 *
 * @param array $flines Lines of the record, in file order.
 * @return Cell Fields missing from the input stay NULL, except desc, which is
 *              an empty string when there is no CD line.
 */
function parse_cell_transfac($flines)
{
    $id = NULL;
    $accession = NULL;
    $date_created = NULL;
    $date_updated = NULL;
    $organism = NULL;
    $factor_src = NULL;
    $cd_string = "";

    if (!is_array($flines)) $flines = array();

    // foreach replaces the old each() loop (each() does not exist in PHP 8).
    foreach ($flines as $linestr) {
        // substr() is used directly; the original called the project's left()
        // helper, which is assumed to return the first N characters.
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel == "//") break;
        $linedata = trim((string) substr($linestr, 4));

        switch ($linelabel) {
            case "ID":
                $id = $linedata;
                break;
            case "AC":
                $accession = $linedata;
                break;
            case "DT":
                if (preg_match('/^(.*?)\s*\((created|updated)\)/', $linedata, $m)) {
                    if ($m[2] == "created") $date_created = $m[1];
                    else $date_updated = $m[1];
                }
                break;
            case "OS":
                $organism = $linedata;
                break;
            case "SO":
                $factor_src = $linedata;
                break;
            case "CD":
                $cd_string .= $linedata . " ";
                break;
        }
    }

    $oCell = new Cell();
    $oCell->accession = $accession;
    $oCell->id = $id;
    $oCell->date_created = $date_created;
    $oCell->date_updated = $date_updated;
    $oCell->organism = $organism;
    $oCell->factor_src = $factor_src;
    // Trimmed here so the description is clean even when CD is the last field.
    $oCell->desc = trim($cd_string);
    return $oCell;
}
/**
 * Plain data holder for one Transfac factor record; presumably filled by
 * parse_factor_transfac() from FACTOR.DAT.
 */
class Factor
{
    // Identification
    public $accession;
    public $id;
    public $date_created;
    public $date_updated;
    public $author;
    public $factor_name;
    public $synonyms;

    // Taxonomy
    public $organism;   // "organism" here refers to the common name.
    public $species;    // "species" is the scientific name of "organism".
    public $tax_class;
    public $homologs;

    // Classification
    public $class_accno;
    public $class_id;
    public $class_decno;

    // Sequence
    public $length;
    public $molwt;
    public $sequence;
    public $seq_comment;

    // Features
    public $features;
    public $feat_struct;
    public $cell_spec_pos;
    public $cell_spec_neg;
    public $feat_func;
    public $inter_fact;
    public $matrix;

    // Binding sites
    public $bndsite_accno;
    public $bndsite_id;
    public $bndsite_quality;
    public $bndsite_species;

    // References
    public $ref_no;
    public $ref_author;
    public $ref_title;
    public $ref_data;
    public $dbref;
}
// parse_factor_transfac() parses FACTOR.DAT (Transfac) and returns a Site object containing parsed data.
function parse_factor_transfac($flines)
{
$desc_flag = FALSE;
$desc_string = "";
$region_flag = FALSE;
$region_string = "";
$syn_flag = FALSE;
$syn_string = "";
$homo_flag = FALSE;
$homo_string = "";
$tax_flag = FALSE;
$tax_string = "";
while ( list($no, $linestr) = each($flines) )
{ // OPENS 1st (outermost) while ( list($no, $linestr) = each($flines) )
$linelabel = left($linestr, 2);
$linedata = trim(substr($linestr, 4));
$lineend = right($linedata, 1);
// ID - IDENTIFICATION data field - one string in one line. (Same as SITE)
if ($linelabel == "ID") $id = $linedata;
// AC - ACCESSION NO data field - one string in one line. (Same as SITE)
if ($linelabel == "AC") $accession = $linedata;
/* DT - DATE data field - usually comes in two lines, the first is the
date/time created, and the second, the date updated. Example:
DT 20.06.90 11:00:03 (created); ewi.
DT 24.08.95 (updated); hiwi.
I've modified the code to allow a TIME entry after the DATE. Later, update
the code for the DT field of class SITE. - Serge
*/
if ($linelabel == "DT")
{
// Assume "(created)", "(updated)" appear after the date/time, not in fixed position.
$date_tokens = preg_split("/\s+/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
if (count($date_tokens) > 3)
{
// The line contains the TIME entry as its second token.
$type = $date_tokens[2];
if ($type == "(created);") $date_created = $date_tokens[0] . " " . $date_tokens[1];
if ($type == "(updated);") $date_updated = $date_tokens[0] . " " . $date_tokens[1];
}
else
{
// The line does not contain the TIME entry as its second token.
$type = $date_tokens[1];
if ($type == "(created);") $date_created = $date_tokens[0];
if ($type == "(updated);") $date_updated = $date_tokens[0];
}
}
// FA - FACTOR NAME data field - assume one string in one line.
if ($linelabel == "FA") $factor_name = $linedata;
// SY - SYNONYMS data field - assume to be one or more lines, each entry separated by ;
// Example: SY AGP/EBP; ANF-2; CRP2; H-APF-2; IL-6DBP; LAP; LAP1; NF-IL6; NF-M;
// Output: ( "AGP/EBP", "ANF-2", ... )
if ($linelabel == "SY")
{
$syn_string .= $linedata . " ";
$syn_flag = TRUE;
}
elseif ($syn_flag)
{
$syn_string = trim($syn_string);
$syn_tokens = preg_split("/;/", $syn_string, -1, PREG_SPLIT_NO_EMPTY);
array_walk($syn_tokens, "trim_element");
// Later, look into possibility that some elements of $syn_tokens array might
// contain special characters like ', \, /, etc. - Serge
$syn_flag = FALSE;
}
// OS - ORGANISM SPECIES data field - assume to be always one line of this form:
// Syntax: OS common_name, scientific_name.
// Example: OS human, homo sapiens
// Output: $organism = "human"
// $species = "homo sapiens"
if ($linelabel == "OS")
{
$org_tokens = preg_split("/,/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
array_walk($org_tokens, "trim_element");
$organism = $org_tokens[0];
$species = $org_tokens[1];
}
// OC - ORGANISM CLASSIFICATION data field - assume to be always one line of this form:
// Syntax: OC kingdom; phylum; class; ...;
// Example:
// OC eukaryota; animalia; metazoa; chordata; vertebrata;
// OC tetrapoda; mammalia; eutheria; primates
// Output: $tax_class = array("eukaryota", "mammalia", ...)
// Later, convert this into an associative array. Same goes for GenBank, etc. - Serge
if ($linelabel == "OC")
{
$tax_string .= $linedata . " ";
$tax_flag = TRUE;
}
elseif ($tax_flag)
{
$tax_string = trim($tax_string);
$tax_tokens = preg_split("/;/", $tax_string, -1, PREG_SPLIT_NO_EMPTY);
array_walk($tax_tokens, "trim_element");
$tax_flag = FALSE;
}
// HO - HOMOLOGS data field - assume to be multiple entries separated by comma,
// may span one or more lines to be concatenated by a whitespace.
if ($linelabel == "HO")
{
$homo_string .= $linedata . " ";
$homo_flag = TRUE;
}
elseif ($homo_flag)
{
$homo_tokens = preg_split("/,/", trim($homo_string), -1, PREG_SPLIT_NO_EMPTY);
array_walk($homo_tokens, "trim_element");
$homo_flag = FALSE;
}
// CL - CLASS data field. Always one line with 3 entries sep by a ;
// Example: CL C0001; CH; 2.3.3.0.1.
// Output: $class_accno = "C0001", $class_id = "CH", $class_decno = "2.3.3.0.1."
if ($linelabel == "CL")
{
$class_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
array_walk($class_tokens, "trim_element");
}
if ($linelabel == "//") break;
} // CLOSES 1st (outermost) while ( list($no, $linestr) = each($flines) )
$oFactor = new Factor();
$oFactor->accession = $accession;
$oFactor->id = $id;
$oFactor->date_created = $date_created;
$oFactor->date_updated = $date_updated;
$oFactor->factor_name = $factor_name;
$oFactor->synonyms = $syn_tokens;
$oFactor->organism = $organism;
$oFactor->species = $species;
$oFactor->homolog = $homo_tokens;
$oFactor->tax_class = $tax_tokens;
$oFactor->class_accno = $class_tokens[0];
$oFactor->class_id = $class_tokens[1];
$oFactor->class_decno = $class_tokens[2];
return $oFactor;
} // CLOSES parse_site_transfac() function
// Site is a plain record for one entry of the Transfac SITE.DAT file.
// Every member starts out as NULL; parse_site_transfac() assigns accession, id,
// date_created, date_updated, seqtype, desc and gene_region. The other members
// are declared here but are not assigned by that parser.
class Site
{
    // AC and ID lines.
    var $accession, $id;
    // DT lines.
    var $date_created, $date_updated;
    var $author;
    // TY, DE and RE lines.
    var $seqtype, $desc, $gene_region;
    // Sequence / position related members.
    var $regel_seq, $denom, $firstpos, $lastpos, $firstpos_def;
    var $bind_factor;
    // Source and annotation related members.
    var $organism, $tax_class, $factor_src, $method, $comments, $dbref;
    // Literature reference members.
    var $refno, $ref_author, $ref_title, $ref_data;
} // closes CLASS SITE
// parse_site_transfac() parses one record of SITE.DAT (Transfac) and returns a Site
// object containing the parsed data.
//
// $flines - array of the record's lines. Each line carries a two-character label in
//           columns 1-2 and its data from column 5 onwards. Parsing stops at the
//           end-of-record marker "//" or at the end of the array, whichever comes first.
//
// accession, id, date_created, date_updated and seqtype are NULL when the record has
// no line with the matching label; desc and gene_region are "" in that case.
function parse_site_transfac($flines)
{
    $id = null;
    $accession = null;
    $date_created = null;
    $date_updated = null;
    $seqtype = null;
    // DE / RE may span several lines; the pieces are joined after the loop so the
    // result is complete even when such a field is the last thing in the input.
    $desc_lines = array();
    $region_lines = array();

    // foreach replaces while(list() = each()): each() no longer exists in PHP 8.
    foreach ($flines as $linestr)
    {
        // substr() is used directly instead of the external left() helper.
        $linelabel = substr($linestr, 0, 2);
        $linedata = trim(substr($linestr, 4));

        // "//" - END OF RECORD MARKER
        if ($linelabel == "//") break;

        // ID - IDENTIFICATION data field - one string in one line.
        if ($linelabel == "ID")
        {
            $id = $linedata;
        }
        // AC - ACCESSION NO data field - one string in one line.
        elseif ($linelabel == "AC")
        {
            $accession = $linedata;
        }
        /* DT - DATE data field - usually comes in two lines, the first is the
           date created, and the second, the date updated. Example:
           DT 20.06.90 (created); ewi.
           DT 24.08.95 (updated); hiwi.
           A TIME entry may follow the DATE (as in FACTOR.DAT); it is then kept as
           part of the stored value, e.g. "20.06.90 11:00:03".
        */
        elseif ($linelabel == "DT")
        {
            // Everything in front of the lowercase word "created"/"updated" (and its
            // optional opening parenthesis) is the date, so the keyword no longer
            // has to sit at a fixed column.
            if (preg_match('/^(.+?)\s*\(?(created|updated)\b/', $linedata, $dt_match))
            {
                $stamp = preg_replace('/\s+/', ' ', $dt_match[1]);
                if ($dt_match[2] == "created") $date_created = $stamp;
                else $date_updated = $stamp;
            }
        }
        // TY - SEQUENCE TYPE data field - one string (one letter?) in one line.
        // Example: TY D
        elseif ($linelabel == "TY")
        {
            $seqtype = $linedata;
        }
        // DE - DESCRIPTION data field - from sample data, it seems always one line.
        // Assume may be one or more lines concatenated with a whitespace char.
        elseif ($linelabel == "DE")
        {
            $desc_lines[] = $linedata;
        }
        // RE - GENE REGION data field - from sample data, it seems always one line.
        // Assume may be one or more lines concatenated with a whitespace char.
        // Example: RE intron promoter
        elseif ($linelabel == "RE")
        {
            $region_lines[] = $linedata;
        }
    }

    $oSite = new Site();
    $oSite->accession = $accession;
    $oSite->id = $id;
    $oSite->date_created = $date_created;
    $oSite->date_updated = $date_updated;
    $oSite->seqtype = $seqtype;
    $oSite->desc = trim(implode(" ", $desc_lines));
    $oSite->gene_region = trim(implode(" ", $region_lines));
    return $oSite;
} // CLOSES parse_site_transfac() function
?>
=====================================================
UNIGENE.INC.PHP - SOURCE CODE
<?php
//unigene.inc.php
require_once("etc.inc.php");
require_once("seqdb.inc.php");
require_once("seq.inc.php");
// Gene_Unigene is a plain record for one UniGene entry as produced by
// parse_gene_unigene(): the ID value, the TITLE text and the SCOUNT value.
class Gene_Unigene
{
    // MEMBER section
    var $entry_id, $title, $seq_count;
}
// parse_gene_unigene() parses one UniGene record and returns a Gene_Unigene object.
//
// $flines - array of the record's lines. The label occupies the first 12 columns of a
//           line and the data follows it. Parsing stops at the end-of-record marker
//           "//" or at the end of the array, whichever comes first.
//
// entry_id, title and seq_count are NULL when the record has no ID, TITLE or SCOUNT
// line respectively. EXPRESS and PROTSIM lines are recognised but not parsed yet.
function parse_gene_unigene($flines)
{
    $entry_id = null;
    $title = null;
    $seq_count = null;
    // Data of the TITLE lines of the group currently being read.
    $title_lines = array();

    // foreach replaces while(list() = each()): each() no longer exists in PHP 8.
    foreach ($flines as $linestr)
    {
        $linelabel = trim(substr($linestr, 0, 12));
        $linedata = trim(substr($linestr, 12));

        /* TITLE data field - assume to be multiline.
           Example:
           TITLE ESTs, Moderately similar to putative pyrophosphate-fructose-6-phosphate 1-phosphotransferase [Arabidopsis thaliana] [A.thaliana]
        */
        if ($linelabel == "TITLE")
        {
            $title_lines[] = $linedata;
            continue;
        }
        // First line after a run of TITLE lines: the title is complete.
        if (count($title_lines) > 0)
        {
            $title = trim(implode(" ", $title_lines));
            $title_lines = array();
        }

        // ID data field - from observation, one entry (word) in one line.
        // Example: ID Sbi.1
        if ($linelabel == "ID")
        {
            $entry_id = $linedata;
        }
        /* SCOUNT - SEQUENCE COUNT data field
           Example:
           SCOUNT 12
        */
        elseif ($linelabel == "SCOUNT")
        {
            $seq_count = (int) $linedata;
        }
        /* EXPRESS data field (not parsed yet)
           Example:
           EXPRESS Embryos germinated for 24 hr ; 10- to 14-day-old light-grown (greenhouse) seedlings ; Mix of ovaries of varying immature stages from 8-week-old plants ; Developing preanthesis pannicles ; Leaves
           PROTSIM data field (not parsed yet)
           Example:
           PROTSIM ORG=Arabidopsis thaliana; PROTGI=15221156; PROTID=ref:NP_172664.1; PCT=79.41; ALN=68
        */
        elseif ($linelabel == "//")
        {
            break;
        }
    }
    // The input may end on a TITLE line (no "//" and nothing after it); without this
    // the title of such a record would be dropped.
    if (count($title_lines) > 0)
    {
        $title = trim(implode(" ", $title_lines));
    }

    $oGene = new Gene_Unigene();
    $oGene->entry_id = $entry_id;
    $oGene->title = $title;
    $oGene->seq_count = $seq_count;
    return $oGene;
}
?>
|