BioPHP: PHP for Biocomputing


[ Home Page ] [ I/O Scripts Page ]

Source Code of Other *.inc.php files
(Parser for Other File Formats)

Note: This is part of the BioPHP 1.1 alpha code set. The code, which is more than 2,000 lines long, is still rough. It also depends on
three files, the alpha versions of "seqdb.inc.php", "seq.inc.php", and "etc.inc.php", which I will post shortly. Improvements to the code are welcome!
=============================================== AAINDEX.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");

/**
 * Data holder for one AAINDEX entry.
 *
 * parse_amino_aaindex() sets $accession, $desc and $lit_ref; the remaining
 * properties are declared here but are not filled in by that parser.
 */
class AAIndex
{
    var $accession;    // data of the "H" line
    var $desc;         // data of the "D" line and its continuation lines, joined with spaces
    var $lit_ref;      // array: database name => entry id, built from "R" lines
    var $author;
    var $title;
    var $journal;
    var $comment;
    var $corel_accno;
    var $index_data;
}

/**
 * Parses the lines of one AAINDEX1 entry and returns an AAIndex object.
 *
 * Each line is read as a one-character label in its first column followed by
 * its data from the third character on. Handled labels:
 *   H - accession number (one word).
 *   D - description; following lines that start with a blank continue it.
 *   R - cross references of the form "DBNAME1:ID1 DBNAME2:ID2 ...";
 *       empty R lines and items without a ":" are ignored.
 * Reading stops at the first line that starts with "//".
 *
 * @param array $flines Lines of one entry, in file order.
 * @return AAIndex accession and desc are NULL when the entry has no H or D
 *                 line; lit_ref is an array (possibly empty).
 */
function parse_amino_aaindex($flines)
{
    $accession = null;
    $desc = null;
    $desc_string = "";
    $desc_flag = false;
    $aRefs = array();

    foreach ($flines as $linestr) {
        // The terminator is two characters wide, so it must be tested on the
        // raw line; the one-character label below can never equal "//".
        if (substr($linestr, 0, 2) === "//") {
            break;
        }

        $linelabel = (string) substr($linestr, 0, 1);
        $linedata = trim((string) substr($linestr, 2));

        if ($linelabel === "H") {
            $accession = $linedata;
        }

        if ($linelabel === "D") {
            // A new D line starts a fresh description.
            $desc_string = $linedata . " ";
            $desc_flag = true;
        } elseif ($desc_flag) {
            if ($linelabel === " ") {
                $desc_string .= $linedata . " ";
            } else {
                // Any other label ends the description.
                $desc = trim($desc_string);
                $desc_flag = false;
            }
        }

        if ($linelabel === "R" && $linedata !== "") {
            foreach (preg_split("/\s+/", $linedata, -1, PREG_SPLIT_NO_EMPTY) as $ref_item) {
                $item_tokens = preg_split("/\:/", $ref_item, -1, PREG_SPLIT_NO_EMPTY);
                if (isset($item_tokens[1])) {
                    $aRefs[$item_tokens[0]] = $item_tokens[1];
                }
            }
        }
    }

    // The description may still be open when the terminator (or the end of
    // the input) is reached.
    if ($desc_flag) {
        $desc = trim($desc_string);
    }

    $oAAIndex = new AAIndex();
    $oAAIndex->accession = $accession;
    $oAAIndex->desc = $desc;
    $oAAIndex->lit_ref = $aRefs;
    return $oAAIndex;
} // CLOSES parse_amino_aaindex() function
?>
=============================================== BLOCKS.INC.PHP - SOURCE CODE
<?php
// blocks.inc.php

/**
 * Data holder for one BLOCKS protein-family entry, as filled in by
 * parse_protfam_blocks().
 */
class ProtFam_Blocks
{
    var $id;           // first ";"-separated field of the ID line
    var $accession;    // first ";"-separated field of the AC line
    var $dist_min;     // "min" of "distance from previous block=(min,max)"
    var $dist_max;     // "max" of the same expression
    var $desc;         // DE line(s), joined with spaces
    var $aa_triplet;   // first ";"-separated field of the BL line
}

/**
 * Parses the lines of one BLOCKS entry and returns a ProtFam_Blocks object.
 *
 * Each line is read as a two-character label followed by its data from the
 * sixth character on. Handled labels:
 *   ID - identifier; only the part before the first ";" is kept.
 *   AC - "<accession no>; distance from previous block=(min,max)"
 *        Example: AC IPB002128C; distance from previous block=(30,31)
 *   DE - description; several DE lines are joined with single spaces.
 *   BL - Example: BL RDG; width=55; seqs=158; 99.5%=1918; strength=2332
 *        Only the first field ("RDG") is kept.
 * Reading stops at the first line that starts with "//".
 *
 * Only standard PHP string functions are used, because this file does not
 * include the file that defines the left()/right() helpers.
 *
 * @param array $flines Lines of one entry, in file order.
 * @return ProtFam_Blocks Properties whose line is missing (or whose
 *                        "(min,max)" part cannot be read) are left NULL.
 */
function parse_protfam_blocks($flines)
{
    $id = null;
    $accession = null;
    $dist_min = null;
    $dist_max = null;
    $aa_triplet = null;
    $desc_lines = array();

    foreach ($flines as $linestr) {
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel === "//") {
            break;
        }
        $linedata = trim((string) substr($linestr, 5));

        switch ($linelabel) {
            case "ID":
                $id_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
                if (isset($id_tokens[0])) {
                    $id = trim($id_tokens[0]);
                }
                break;

            case "AC":
                $ac_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
                if (isset($ac_tokens[0])) {
                    $accession = trim($ac_tokens[0]);
                }
                // Pull both numbers out of "...=(min,max)" in one step.
                if (isset($ac_tokens[1])
                    && preg_match('/\(\s*(-?\d+)\s*,\s*(-?\d+)/', $ac_tokens[1], $m)) {
                    $dist_min = (int) $m[1];
                    $dist_max = (int) $m[2];
                }
                break;

            case "DE":
                $desc_lines[] = $linedata;
                break;

            case "BL":
                $bl_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
                if (isset($bl_tokens[0])) {
                    $aa_triplet = trim($bl_tokens[0]);
                }
                break;
        }
    }

    $oProtFam = new ProtFam_Blocks();
    $oProtFam->id = $id;
    $oProtFam->accession = $accession;
    $oProtFam->desc = count($desc_lines) > 0 ? trim(implode(" ", $desc_lines)) : null;
    $oProtFam->dist_min = $dist_min;
    $oProtFam->dist_max = $dist_max;
    $oProtFam->aa_triplet = $aa_triplet;
    return $oProtFam;
}
?>
=============================================== EMBL.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");
require_once("seqdb.inc.php");

/**
 * Data holder for one EMBL nucleotide entry, as filled in by parse_na_embl().
 */
class EmblSeq
{
    var $entry_name;
    var $moltype;
    var $data_class;
    var $length;
    var $seq_ver;      // assigned by parse_na_embl() from the SV line; was previously undeclared
    var $accession;
    var $create_date;
    var $create_rel;
    var $sequpd_date;
    var $sequpd_rel;
    var $notupd_date;
    var $notupd_rel;
    var $desc;
}

// parse_na_embl() parses an EMBL DNA data file and returns an EmblSeq object
// containing parsed data.
/**
 * Parses the lines of one EMBL nucleotide entry and returns an EmblSeq object.
 *
 * Each line is read as a two-character label followed by its data from the
 * sixth character on. Reading stops at the first line that starts with "//".
 * Only the following line types contribute to the result; all others
 * (including CC comment lines) are skipped:
 *   ID - Example: ID AF039870 standard; DNA; GSS; 526 BP.
 *   AC - ";"-separated accession numbers; several AC lines are merged.
 *   SV - sequence version, stored as-is.
 *   DT - "DD-MMM-YEAR (Rel. XX, COMMENT)". A COMMENT of "Created" fills
 *        create_date/create_rel; a COMMENT containing "Last updated" fills
 *        sequpd_date/sequpd_rel. Lines that do not have this shape are ignored.
 *   DE - description; several DE lines are joined with single spaces.
 *
 * NOTE(review): entry_name receives only the part of the ID name before its
 * first "_" (the whole name when it contains none). That is what the previous
 * implementation returned; confirm it is intended before changing it.
 *
 * @param array $flines Lines of one entry, in file order.
 * @return EmblSeq accession is an array, desc a string ("" without DE lines);
 *                 every other property is NULL when its line is missing.
 *                 notupd_date/notupd_rel are always NULL because no DT
 *                 comment handled here maps to them.
 */
function parse_na_embl($flines)
{
    $na_name = null;
    $data_class = null;
    $moltype = null;
    $length = null;
    $seq_ver = null;
    $accession = array();
    $create_date = null;
    $create_rel = null;
    $sequpd_date = null;
    $sequpd_rel = null;
    $desc_lines = array();

    foreach ($flines as $linestr) {
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel === "//") {
            break;
        }
        $linedata = trim((string) substr($linestr, 5));

        switch ($linelabel) {
            case "ID":
                $words = explode(";", $linedata);
                $endc = preg_split("/\s+/", $words[0], -1, PREG_SPLIT_NO_EMPTY);
                if (isset($endc[0])) {
                    $namesrc = preg_split("/_/", $endc[0], -1, PREG_SPLIT_NO_EMPTY);
                    $na_name = isset($namesrc[0]) ? $namesrc[0] : null;
                }
                if (isset($endc[1])) {
                    $data_class = $endc[1];
                }
                if (isset($words[1])) {
                    $moltype = trim($words[1]);
                }
                if (isset($words[3])) {
                    // " 526 BP." -> 526
                    $bp_tokens = preg_split("/\s+/", $words[3], -1, PREG_SPLIT_NO_EMPTY);
                    if (isset($bp_tokens[0])) {
                        $length = (int) $bp_tokens[0];
                    }
                }
                break;

            case "AC":
                foreach (explode(";", $linedata) as $acc) {
                    $acc = trim($acc);
                    if ($acc !== "") {
                        $accession[] = $acc;
                    }
                }
                break;

            case "SV":
                $seq_ver = $linedata;
                break;

            case "DT":
                // 1: date, 2: release, 3: comment without a closing ")" or ").".
                if (preg_match('/^(\S+)\s*\(\s*Rel\.\s*([^,]*),\s*(.*?)\)?\.?\s*$/i', $linedata, $m)) {
                    $date = substr($m[1], 0, 11);
                    $rel = trim($m[2]);
                    $comment = strtoupper(trim($m[3]));
                    if ($comment === "CREATED") {
                        $create_date = $date;
                        $create_rel = $rel;
                    } elseif (strpos($comment, "LAST UPDATED") !== false) {
                        $sequpd_date = $date;
                        $sequpd_rel = $rel;
                    }
                }
                break;

            case "DE":
                $desc_lines[] = $linedata;
                break;
        }
    }

    $oEmblSeq = new EmblSeq();
    $oEmblSeq->entry_name = $na_name;
    $oEmblSeq->data_class = $data_class;
    $oEmblSeq->moltype = $moltype;
    $oEmblSeq->length = $length;
    $oEmblSeq->seq_ver = $seq_ver;
    $oEmblSeq->accession = $accession;
    $oEmblSeq->create_date = $create_date;
    $oEmblSeq->create_rel = $create_rel;
    $oEmblSeq->sequpd_date = $sequpd_date;
    $oEmblSeq->sequpd_rel = $sequpd_rel;
    $oEmblSeq->notupd_date = null;
    $oEmblSeq->notupd_rel = null;
    $oEmblSeq->desc = implode(" ", $desc_lines);
    return $oEmblSeq;
} // CLOSES parse_na_embl()
?>
=============================================== EPD.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");
require_once("seqdb.inc.php");

/**
 * Data holder for one EPD promoter entry, as filled in by
 * parse_promoter_epd().
 */
class Promoter
{
    var $entry_name;
    var $data_type;    // declared by the original class; not set by parse_promoter_epd()
    var $data_class;   // assigned by parse_promoter_epd(); was previously undeclared
    var $insite_type;
    var $tax_div;
    var $accession;
    var $create_date;
    var $create_rel;
    var $sequpd_date;
    var $sequpd_rel;
    var $notupd_date;
    var $notupd_rel;
}

/**
 * Parses the lines of one EPD entry and returns a Promoter object.
 *
 * Each line is read as a two-character label followed by its data from the
 * sixth character on. Reading stops at the first line that starts with "//",
 * so data of a following entry is never merged into this one. Only the
 * following line types contribute to the result; all others are skipped:
 *   ID - "<entry name> <data class>; <second field>; <third field>."
 *        The second field becomes insite_type, the third (without its
 *        final period) becomes tax_div.
 *   AC - ";"-separated accession numbers; several AC lines are merged.
 *   DT - "DD-MMM-YEAR (Rel. XX, COMMENT)", optionally followed by a period.
 *        COMMENT "Created" fills create_*, a COMMENT containing
 *        "Last sequence update" fills sequpd_*, one containing
 *        "Last annotation update" fills notupd_*. Matching ignores case.
 *
 * @param array $flines Lines of one entry, in file order.
 * @return Promoter accession is an array; every other property is NULL when
 *                  its line is missing.
 */
function parse_promoter_epd($flines)
{
    $entry_name = null;
    $data_class = null;
    $insite_type = null;
    $tax_div = null;
    $accession = array();
    $create_date = null;
    $create_rel = null;
    $sequpd_date = null;
    $sequpd_rel = null;
    $notupd_date = null;
    $notupd_rel = null;

    foreach ($flines as $linestr) {
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel === "//") {
            break;
        }
        $linedata = trim((string) substr($linestr, 5));

        switch ($linelabel) {
            case "ID":
                $words = explode(";", $linedata);
                $endc = preg_split("/\s+/", $words[0], -1, PREG_SPLIT_NO_EMPTY);
                if (isset($endc[0])) {
                    $entry_name = $endc[0];
                }
                if (isset($endc[1])) {
                    $data_class = $endc[1];
                }
                if (isset($words[1])) {
                    $insite_type = trim($words[1]);
                }
                if (isset($words[2])) {
                    $tax_div = trim($words[2]);
                    if (substr($tax_div, -1) === ".") {
                        $tax_div = substr($tax_div, 0, -1);
                    }
                }
                break;

            case "AC":
                foreach (explode(";", $linedata) as $acc) {
                    $acc = trim($acc);
                    if ($acc !== "") {
                        $accession[] = $acc;
                    }
                }
                break;

            case "DT":
                // 1: date, 2: release, 3: comment without a closing ")" or ").".
                // Dropping the trailing punctuation here lets "Created" match
                // whether or not the line ends with a period.
                if (preg_match('/^(\S+)\s*\(\s*Rel\.\s*([^,]*),\s*(.*?)\)?\.?\s*$/i', $linedata, $m)) {
                    $date = substr($m[1], 0, 11);
                    $rel = trim($m[2]);
                    $comment = strtoupper(trim($m[3]));
                    if ($comment === "CREATED") {
                        $create_date = $date;
                        $create_rel = $rel;
                    } elseif (strpos($comment, "LAST SEQUENCE UPDATE") !== false) {
                        $sequpd_date = $date;
                        $sequpd_rel = $rel;
                    } elseif (strpos($comment, "LAST ANNOTATION UPDATE") !== false) {
                        $notupd_date = $date;
                        $notupd_rel = $rel;
                    }
                }
                break;
        }
    }

    $oPromoter = new Promoter();
    $oPromoter->entry_name = $entry_name;
    $oPromoter->data_class = $data_class;
    $oPromoter->insite_type = $insite_type;
    $oPromoter->tax_div = $tax_div;
    $oPromoter->accession = $accession;
    $oPromoter->create_date = $create_date;
    $oPromoter->create_rel = $create_rel;
    $oPromoter->sequpd_date = $sequpd_date;
    $oPromoter->sequpd_rel = $sequpd_rel;
    $oPromoter->notupd_date = $notupd_date;
    $oPromoter->notupd_rel = $notupd_rel;
    return $oPromoter;
} // CLOSES parse_promoter_epd()

/*
NOTES:

NOTE 1: June 3, 2003 - I noticed that an EPD entry may have a period (.)
at the end while Swissprot entries do not have one. This causes my
Swissprot code to run into problems because it uses an exact match of
keyphrase (LAST SEQ...) at a particular position within the DT line.
I'm changing exact match to a substring containment test.
*/
?>
=============================================== EXPASY.INC.PHP - SOURCE CODE
<?php

/**
 * Data holder for one ENZYME (ExPASy) entry, as filled in by
 * parse_enzyme_expasy().
 */
class Enzyme_Expasy
{
    var $id;             // data of the ID line
    var $desc;           // DE line(s), joined with spaces
    var $alt_names;      // array, one element per AN line, final period removed
    var $cat_activity;   // CA line(s), joined with spaces, final period removed; was previously undeclared
    var $cofactors;      // array built from the ";"-separated CF line(s)
    var $comments;       // CC lines, each followed by "\n"
    var $diseases;       // array of array(reference id, disease name) from DI lines
    var $prosite_refs;   // array of the second ";"-separated field of each PR line
    var $swp_refs;       // array: first => second ","-separated value of each DR item
}

/**
 * Parses the lines of one ENZYME entry and returns an Enzyme_Expasy object.
 *
 * Each line is read as a two-character label followed by its data from the
 * sixth character on. Reading stops at the first line that starts with "//";
 * an entry without that terminator is still parsed completely. Handled labels:
 *   ID - Example: ID 1.1.1.2
 *   DE - Example: DE Alcohol dehydrogenase (NADP+).
 *   AN - Example: AN Aldehyde reductase (NADPH).
 *        One alternate name per line.
 *   CA - Example: CA Propane-1,2-diol 1-phosphate + NAD(+) = hydroxyacetone phosphate +
 *                 CA NADH.
 *   CF - Example: CF Potassium or Ammonia; Manganese or Cobalt.
 *   CC - kept as one text with the given line division; "-!-" block markers
 *        are not interpreted.
 *   DI - Example: DI 6-phosphogluconate dehydrogenase deficiency; MIM: 172200.
 *        Stored as array("172200", "6-phosphogluconate dehydrogenase deficiency").
 *   PR - Example: PR PROSITE; PDOC00058;
 *   DR - Example: DR P35630, ADH1_ENTHI; Q24857, ADH3_ENTHI; O57380, ADH4_RANPE;
 * A final period is removed from AN names, the CA text, the last CF item and
 * the DI reference; text that does not end with a period is left untouched.
 * DI lines without a name, PR lines without a second field and DR items
 * without a "," are skipped.
 *
 * @param array $flines Lines of one entry, in file order.
 * @return Enzyme_Expasy id, desc and cat_activity are NULL when their lines
 *                       are missing; all list properties are arrays
 *                       (possibly empty); comments is a string.
 */
function parse_enzyme_expasy($flines)
{
    $id = null;
    $desc_lines = array();
    $aAltNames = array();
    $ca_lines = array();
    $cf_lines = array();
    $cc_string = "";
    $aDiseases = array();
    $aPrositeRefs = array();
    $dr_lines = array();

    // Pass 1: sort the lines by label. Multi-line fields are only collected
    // here so that they are complete no matter which line follows them.
    foreach ($flines as $linestr) {
        $linelabel = (string) substr($linestr, 0, 2);
        if ($linelabel === "//") {
            break;
        }
        $linedata = trim((string) substr($linestr, 5));

        switch ($linelabel) {
            case "ID":
                $id = $linedata;
                break;

            case "DE":
                $desc_lines[] = $linedata;
                break;

            case "AN":
                $aAltNames[] = preg_replace('/\.\z/', '', $linedata);
                break;

            case "CA":
                $ca_lines[] = $linedata;
                break;

            case "CF":
                $cf_lines[] = $linedata;
                break;

            case "CC":
                // Untrimmed on purpose: the given line layout is retained.
                $cc_string .= substr($linestr, 5) . "\n";
                break;

            case "DI":
                $di_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
                if (isset($di_tokens[0])) {
                    $lit_ref = "";
                    if (isset($di_tokens[1])) {
                        // "MIM: 172200." -> "172200"
                        $lit_tokens = preg_split("/\:/", trim($di_tokens[1]), -1, PREG_SPLIT_NO_EMPTY);
                        if (isset($lit_tokens[1])) {
                            $lit_ref = preg_replace('/\.\z/', '', trim($lit_tokens[1]));
                        }
                    }
                    $aDiseases[] = array($lit_ref, trim($di_tokens[0]));
                }
                break;

            case "PR":
                $pr_tokens = preg_split("/;/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
                if (isset($pr_tokens[1])) {
                    $aPrositeRefs[] = trim($pr_tokens[1]);
                }
                break;

            case "DR":
                $dr_lines[] = $linedata;
                break;
        }
    }

    // Pass 2: turn the collected multi-line fields into their final form.
    $desc = count($desc_lines) > 0 ? trim(implode(" ", $desc_lines)) : null;

    $cat_activity = null;
    if (count($ca_lines) > 0) {
        $cat_activity = preg_replace('/\.\z/', '', trim(implode(" ", $ca_lines)));
    }

    $aCofactors = array();
    if (count($cf_lines) > 0) {
        $aCofactors = array_map(
            'trim',
            preg_split("/;/", trim(implode(" ", $cf_lines)), -1, PREG_SPLIT_NO_EMPTY)
        );
        $lastindex = count($aCofactors) - 1;
        if ($lastindex >= 0) {
            $aCofactors[$lastindex] = preg_replace('/\.\z/', '', $aCofactors[$lastindex]);
        }
    }

    $aDRs = array();
    foreach (preg_split("/;/", implode(";", $dr_lines), -1, PREG_SPLIT_NO_EMPTY) as $dr_entry) {
        $dr_entry_tokens = preg_split("/\,/", $dr_entry, -1, PREG_SPLIT_NO_EMPTY);
        if (isset($dr_entry_tokens[1])) {
            $aDRs[trim($dr_entry_tokens[0])] = trim($dr_entry_tokens[1]);
        }
    }

    $oEnzyme = new Enzyme_Expasy();
    $oEnzyme->id = $id;
    $oEnzyme->desc = $desc;
    $oEnzyme->alt_names = $aAltNames;
    $oEnzyme->cat_activity = $cat_activity;
    $oEnzyme->cofactors = $aCofactors;
    $oEnzyme->comments = $cc_string;
    $oEnzyme->diseases = $aDiseases;
    $oEnzyme->prosite_refs = $aPrositeRefs;
    $oEnzyme->swp_refs = $aDRs;
    return $oEnzyme;
}

/*
AN Alternate name(s) (>=0 per entry)
CA Catalytic activity (>=0 per entry)
CF Cofactor(s) (>=0 per entry)
CC Comments (>=0 per entry)
DI Disease(s) associated with the enzyme (>=0 per entry)
PR Cross-references to PROSITE (>=0 per entry)
DR Cross-references to SWISS-PROT (>=0 per entry)

ID 1.1.1.2
DE Alcohol dehydrogenase (NADP+).
AN Aldehyde reductase (NADPH).
CA An alcohol + NADP(+) = an aldehyde + NADPH.
CF Zinc.
CC -!- Some members of this group oxidize only primary alcohols; others act
CC also on secondary alcohols.
CC -!- May be identical with EC 1.1.1.19, EC 1.1.1.33 and EC 1.1.1.55.
CC -!- A-specific with respect to NADPH.
PR PROSITE; PDOC00061;
DR P35630, ADH1_ENTHI; Q24857, ADH3_ENTHI; O57380, ADH4_RANPE;
*/
?>
=============================================== GENOME.INC.PHP - SOURCE CODE
<?php
// The helper files are included under their "*.inc.php" names, matching the
// require_once() calls of the other parser files in this code set.
require_once("seqdb.inc.php");
require_once("etc.inc.php");

/*
genome.inc.php - include file containing the Genome class and its parser functions

Description: Contains the definition of the Genome class plus some helper functions.
Author: Serge Gregorio, Jr.
Date: May 4, 2003
License: General Public License 2.0

This code has been written as part of the GenePHP/BioPHP project, located at:
http://genephp.sourceforge.net
*/

/**
 * Data holder for one genome record. The class itself has no methods.
 */
class genome
{
    var $organism;      // scientific name of source organism
    var $common_name;   // common name of source organism
    var $tax_class;     // taxonomic classification of source organism (excl. scientific name)
    var $is_complete;   // YES/NO indicating if genome has been completely sequenced
    var $gb_release;    // GenBank release used for stats below
    var $gb_entries;    // number of entries in GenBank
    var $gb_basepairs;  // number of basepairs in GenBank
    var $size;          // haploid size in basepairs
    var $reference;     /* an array of REFERENCE SETs (each of which is itself an array);
                           a single GENOME record/object may contain one or more
                           REFERENCE ENTRIES. A reference set may hold the following
                           info: address, title, journal, volume, pages, year. */
} // closes GENOME class

// $files is an array of the names of all the files to be parsed.
/**
 * Parses every genome record found in a list of files.
 *
 * Each file is read line by line; whenever a line starting with "//" is
 * seen, the lines collected so far are handed to parse_genome_dogs().
 * The script is terminated with die() if a file cannot be opened.
 *
 * @param array $files Names of the files to be parsed.
 * @return array List of genome objects, in the order the records were read.
 */
function parseall_genome_dogs($files)
{
    $aoGenomes = array();
    foreach ($files as $fname) {
        $fp = fopen($fname, "r");
        if ($fp == FALSE) die("Cannot open $fname!");

        $flines = array();
        // Test the fgets() result instead of feof(): checking feof() right after
        // the read discarded the last line of a file that lacks a trailing
        // newline, and with it the whole final record. No length limit is passed
        // either, so long lines are no longer chopped into 100-byte pieces.
        while (($linestr = fgets($fp)) !== FALSE) {
            $flines[] = $linestr;
            if (left($linestr, 2) == '//') {
                $aoGenomes[] = parse_genome_dogs($flines);
                $flines = array();
            }
        }
        fclose($fp);
    }
    return $aoGenomes;
}

/**
 * Parses the lines of one genome record into a genome object.
 *
 * Lines before the first ORGANISM tag are skipped. Parsing stops at the
 * first "//" line that follows a reference section. Fields that do not
 * occur in the record are left as NULL (tax_class and reference as empty
 * arrays).
 *
 * @param array $flines Lines of the record, including line terminators.
 * @return genome The populated record object.
 */
function parse_genome_dogs($flines)
{
    // Scalar fields; NULL means "tag not present in the record".
    $organism = NULL;
    $common_name = NULL;
    $is_complete = NULL;
    $gb_release = NULL;
    $gb_entries = NULL;
    $gb_bps = NULL;
    $size = NULL;

    $in_record_flag = FALSE;

    // State for the (possibly multi-line) CLASSIFICATION field.
    $in_tax_flag = FALSE;
    $tax_string = "";
    $tax_class = array();

    // State for the REFERENCE section.
    $in_ref_flag = FALSE;
    $reference = array();
    $curr_ref = array();

    // State for the multi-line REF_AUTHOR and REF_TITLE subsections.
    $in_author_flag = FALSE;
    $author_string = "";
    $in_title_flag = FALSE;
    $title_string = "";

    // Sentinel terminator: if the caller's lines do not end with "//", this
    // makes the last reference (and any pending author/title text) get stored
    // anyway. When a real "//" is present the loop breaks before reaching it.
    $flines[] = "//";

    foreach ($flines as $linestr) {
        // If the record hasn't begun and this is not the ORGANISM tag, skip the line.
        if (!$in_record_flag) {
            if (left($linestr, 8) != "ORGANISM") continue;
            $in_record_flag = TRUE;
        }

        if (left($linestr, 8) == "ORGANISM") $organism = trim(substr($linestr, 8));
        if (left($linestr, 11) == "COMMON_NAME") $common_name = trim(substr($linestr, 11));

        if (left($linestr, 14) == "CLASSIFICATION") {
            // Start (first line) of the CLASSIFICATION field.
            $tax_string .= trim(substr($linestr, 14)) . " ";
            $in_tax_flag = TRUE;
        } elseif ($in_tax_flag and (left($linestr, 9) != "COMPLETED")) {
            // 2nd, 3rd, etc. line of the CLASSIFICATION field.
            $tax_string .= trim($linestr) . " ";
        }

        if (left($linestr, 9) == "COMPLETED") {
            // Assume that the COMPLETED field ALWAYS follows the CLASSIFICATION
            // field: the classification has ended, convert it into an array.
            $tax_class = preg_split("/;/", $tax_string, -1, PREG_SPLIT_NO_EMPTY);
            array_walk($tax_class, "trim_element");
            $tax_string = "";
            $in_tax_flag = FALSE;
            // Anything other than "yes" (case-insensitive) counts as "no".
            $is_complete = (strtoupper(trim(substr($linestr, 9))) == "YES");
        }

        if (left($linestr, 10) == "GB_RELEASE") $gb_release = trim(substr($linestr, 10));
        if (left($linestr, 10) == "GB_ENTRIES") $gb_entries = (int) (trim(substr($linestr, 10)));
        // Kept as strings: the values may not fit an int.
        if (left($linestr, 12) == "GB_BASEPAIRS") $gb_bps = trim(substr($linestr, 12));
        if (left($linestr, 11) == "GENOME_SIZE") $size = trim(substr($linestr, 11));

        // Multi-line REF_AUTHOR / REF_TITLE handling. This runs BEFORE the
        // REF_TYPE check so that a REF_TYPE line directly following an author or
        // title block still closes that block (previously the early "continue"
        // leaked the pending text into the next reference).
        if ($in_ref_flag) {
            if (left($linestr, 10) == "REF_AUTHOR") {
                $author_string .= trim(substr($linestr, 10)) . " ";
                $in_author_flag = TRUE;
            } elseif ($in_author_flag) {
                if (left($linestr, 1) == "\t") {
                    // Continuation lines of the author list start with a tab.
                    $author_string .= trim($linestr) . " ";
                } else {
                    // Author section has ended: split the comma-separated list.
                    $author_array = preg_split("/,/", $author_string, -1, PREG_SPLIT_NO_EMPTY);
                    array_walk($author_array, "trim_element");
                    $curr_ref["AUTHOR"] = $author_array;
                    $author_string = "";
                    $in_author_flag = FALSE;
                }
            }

            if (left($linestr, 9) == "REF_TITLE") {
                // Assume there is only one title, spread over one or more lines.
                $title_string .= trim(substr($linestr, 9)) . " ";
                $in_title_flag = TRUE;
            } elseif ($in_title_flag) {
                if (left($linestr, 1) == "\t") {
                    $title_string .= trim($linestr) . " ";
                } else {
                    // Title section has ended; drop the separator space appended last.
                    $curr_ref["TITLE"] = trim($title_string);
                    $title_string = "";
                    $in_title_flag = FALSE;
                }
            }
        }

        if (left($linestr, 8) == "REF_TYPE") {
            // A new reference set starts; store the previous one, if any.
            if ($in_ref_flag) $reference[] = $curr_ref;
            $ref_type = preg_split("/\s/", trim(substr($linestr, 8)), -1, PREG_SPLIT_NO_EMPTY);
            $curr_ref = array("TYPE" => $ref_type);
            $in_ref_flag = TRUE;
            continue;
        }

        if ($in_ref_flag) {
            // Single-line reference fields.
            if (left($linestr, 11) == "REF_JOURNAL") $curr_ref["JOURNAL"] = trim(substr($linestr, 11));
            if (left($linestr, 10) == "REF_VOLUME") $curr_ref["VOLUME"] = trim(substr($linestr, 10));
            if (left($linestr, 9) == "REF_PAGES") $curr_ref["PAGES"] = trim(substr($linestr, 9));
            if (left($linestr, 8) == "REF_YEAR") $curr_ref["YEAR"] = trim(substr($linestr, 8));

            if (left($linestr, 2) == "//") {
                // End of the last reference set AND end of the record.
                $reference[] = $curr_ref;
                $curr_ref = array();
                $in_ref_flag = FALSE;
                break;
            }
        }
    }

    // Build the result here so it can be changed easily should the genome class
    // change, or should something else (array, string, ...) be returned.
    $oGenome = new genome();
    $oGenome->organism = $organism;
    $oGenome->common_name = $common_name;
    $oGenome->tax_class = $tax_class;
    $oGenome->is_complete = $is_complete;
    $oGenome->gb_release = $gb_release;
    $oGenome->gb_entries = $gb_entries;
    // The class declares $gb_basepairs, but the value used to be stored only
    // under "gb_bps". Fill the declared property and keep the old name for
    // callers that already read it.
    $oGenome->gb_basepairs = $gb_bps;
    $oGenome->gb_bps = $gb_bps;
    $oGenome->size = $size;
    $oGenome->reference = $reference;
    return $oGenome;
}   // closes function parse_genome_dogs()
?>
===============================================
HGBASE.INC.PHP - SOURCE CODE
<?php
require_once("seqdb.inc.php");
require_once("etc.inc.php");

/*
hgbase.inc.php - include file containing the Mutation class (based on HGBase).

Description: Contains the definition of the above classes and some helper functions.
Author: Serge Gregorio, Jr.
Date: May 27, 2003
License: General Public License 2.0

This code has been written as part of the GenePHP/BioPHP project, located at:
http://genephp.sourceforge.net
*/

/**
 * Plain data holder for one HGBase haplotype/mutation record.
 * $allele, $is_in_block and $mesh are declared but not filled in by
 * parse_mutation_hgbase().
 */
class Mutation
{
    var $haplotype_id;
    var $allele;
    var $is_in_block;
    var $population_id;
    var $pop_name;
    var $pop_indiv;
    var $freq_perc;
    var $freq_indiv;
    var $source_id;
    var $citation;
    var $submitter_name;
    var $submission_id;
    var $source_comment;
    var $mesh;
}

/**
 * Parses the lines of one HGBase record into a Mutation object.
 *
 * Labels are matched case-insensitively at the start of each line and
 * parsing stops at the first line starting with "//". Fields that do not
 * occur are left as NULL (citation as an empty string).
 *
 * @param array $flines Lines of the record.
 * @return Mutation The populated record object.
 */
function parse_mutation_hgbase($flines)
{
    $haplotype_id = NULL;
    $population_id = NULL;
    $pop_name = NULL;
    $pop_indiv = NULL;
    $freq_perc = NULL;
    $freq_indiv = NULL;
    $source_id = NULL;
    $submitter_name = NULL;
    $submission_id = NULL;
    $source_comment = NULL;
    $citation = "";

    foreach ($flines as $linestr) {
        // Turns escape sequences written out in the data (e.g. a literal
        // backslash followed by "t") into the characters they stand for.
        $linestr = stripcslashes($linestr);

        // Assume that the haplotype id is always one string in one line.
        if (strtoupper(left($linestr, 11)) == "HAPLOTYPEID") $haplotype_id = trim(substr($linestr, 11));

        // ALLELE and ISINBLOCK data fields are recognised by the format but
        // not parsed yet.

        if (strtoupper(left($linestr, 12)) == "POPULATIONID") {
            $population_id = trim(substr($linestr, 12));
        } elseif (strtoupper(left($linestr, 10)) == "POPULATION") {
            // "elseif": a POPULATIONID line also starts with "POPULATION" and
            // used to be parsed a second time here, wiping $pop_name.
            // Example data: Caucasian (USA) (216 individuals)
            $data = trim(substr($linestr, 10));
            $pop_array = preg_split("/\(/", $data, -1, PREG_SPLIT_NO_EMPTY);
            // Pieces: "Caucasian ", "USA) ", "216 individuals)"
            if (count($pop_array) > 0) {
                $last_piece = array_pop($pop_array);
                $ind_array = preg_split("/\s+/", $last_piece, -1, PREG_SPLIT_NO_EMPTY);
                if ((count($ind_array) > 1) and (strtoupper(left($ind_array[1], 10)) == "INDIVIDUAL")) {
                    $pop_indiv = (int) ($ind_array[0]);
                } else {
                    // No "(N individuals)" suffix: the piece belongs to the name.
                    $pop_array[] = $last_piece;
                }

                // Re-assemble the name, putting back the "(" that the split
                // consumed in front of every piece but a leading plain one.
                $pop_str = "";
                $is_first = TRUE;
                foreach ($pop_array as $item) {
                    if (!$is_first or (left($data, 1) == "(")) $item = "(" . $item;
                    $pop_str .= $item;
                    $is_first = FALSE;
                }
                $pop_name = trim($pop_str);
            }
        }

        if (strtoupper(left($linestr, 9)) == "FREQUENCY") {
            // Example data: 2% (1039 individuals)
            $data = trim(substr($linestr, 9));
            $perc_array = preg_split("/%/", $data, -1, PREG_SPLIT_NO_EMPTY);
            if (count($perc_array) > 0) {
                $freq_perc = (float) array_shift($perc_array);
                if (count($perc_array) > 0) {
                    // "(1039", "individuals)", ...
                    $freq_array = preg_split("/\s+/", trim($perc_array[0]), -1, PREG_SPLIT_NO_EMPTY);
                    if ((count($freq_array) > 0) and (left($freq_array[0], 1) == "(")) {
                        $freq_indiv = substr($freq_array[0], 1);
                    }
                }
            }
        }

        // SOURCEID data field. Assume to be one entry in one line.
        if (strtoupper(left($linestr, 8)) == "SOURCEID") $source_id = trim(substr($linestr, 8));

        // CITATION data field. Several CITATION lines are joined with a space.
        if (strtoupper(left($linestr, 8)) == "CITATION") {
            // str_replace() stands in for ereg_replace(), which was removed in PHP 7.
            $linestr = str_replace("\t", ' ', $linestr);
            $citation .= trim(substr($linestr, 8)) . " ";
        }

        // SUBMITTER data field. Assume to be one line only.
        // Example data: Jan. W. Koper (SUB0001234)
        if (strtoupper(left($linestr, 9)) == "SUBMITTER") {
            $data = trim(substr($linestr, 9));
            $submit_array = preg_split("/\(/", $data, -1, PREG_SPLIT_NO_EMPTY);
            if (count($submit_array) > 0) $submitter_name = trim($submit_array[0]);
            if (count($submit_array) > 1) {
                $submission_id = trim($submit_array[1]);
                // Drop the closing parenthesis, if there is one.
                if (right($submission_id, 1) == ")") {
                    $submission_id = left($submission_id, strlen($submission_id) - 1);
                }
            }
        }

        // SOURCECOMMENT data field. Assume to be one string in one line.
        if (strtoupper(left($linestr, 13)) == "SOURCECOMMENT") $source_comment = trim(substr($linestr, 13));

        // Exit the loop when an end-of-entry marker is found.
        if (left($linestr, 2) == "//") break;
    }

    $oMutation = new Mutation();
    $oMutation->haplotype_id = $haplotype_id;
    $oMutation->population_id = $population_id;
    $oMutation->pop_name = $pop_name;
    $oMutation->pop_indiv = $pop_indiv;
    $oMutation->freq_perc = $freq_perc;
    $oMutation->freq_indiv = $freq_indiv;
    $oMutation->source_id = $source_id;
    // Remove the separator space appended after the last CITATION line.
    $oMutation->citation = trim($citation);
    $oMutation->submitter_name = $submitter_name;
    $oMutation->submission_id = $submission_id;
    $oMutation->source_comment = $source_comment;
    return $oMutation;
}
?>
===============================================
LIT.INC.PHP - SOURCE CODE
<?php
// Same file-name correction as in the other include files: the helper files
// are published as "*.inc.php".
require_once("seqdb.inc.php");
require_once("etc.inc.php");

/*
lit.inc.php - include file containing the classes for Biomedical literature such as
journals, etc.

Description: Contains the definition of the above classes and some helper functions.
Author: Serge Gregorio, Jr.
Date: May 11, 2003
License: General Public License 2.0

This code has been written as part of the GenePHP/BioPHP project, located at:
http://genephp.sourceforge.net
*/

// Placeholder class; it has no members yet.
class Lit
{
}

/*
JrId: 1
JournalTitle: AADE editors' journal.
MedAbbr: AADE Ed J
ISSN: 0160-6999
ESSN: ---
IsoAbbr: ---
NlmId: 7708172
*/

/**
 * Plain data holder for one journal record of the "Label: value" format
 * shown in the sample above.
 */
class Journal
{
    var $id;
    var $title;
    var $med_abbr;
    var $issn;
    var $essn;
    var $iso_abbr;
    var $nlm_id;
}

/**
 * Parses all journal records from an open file or from a list of lines.
 *
 * Records are separated by lines starting with at least ten dashes. The
 * previous implementation looped over an undefined variable, called the
 * non-existent parse_journal_ncbi() and returned nothing.
 *
 * @param resource|array $fp Open file handle, or an array of lines.
 * @return array List of Journal objects in input order (empty if $fp is
 *               neither a resource nor an array).
 */
function parseall_journal_ncbilit($fp)
{
    if (is_array($fp)) {
        $all_lines = $fp;
    } elseif (is_resource($fp)) {
        $all_lines = array();
        while (($linestr = fgets($fp)) !== FALSE) $all_lines[] = $linestr;
    } else {
        return array();
    }

    $aoJournals = array();
    $flines = array();
    foreach ($all_lines as $linestr) {
        if (strncmp($linestr, "----------", 10) == 0) {
            // Separator line: it closes the record collected so far, if any
            // (the very first separator usually precedes the first record).
            if (count($flines) > 0) {
                $aoJournals[] = parse_journal_ncbilit($flines);
                $flines = array();
            }
            continue;
        }
        if (trim($linestr) != "") $flines[] = $linestr;
    }
    // Last record when the input does not end with a separator line.
    if (count($flines) > 0) $aoJournals[] = parse_journal_ncbilit($flines);

    return $aoJournals;
}

/**
 * Parses the lines of one journal record into a Journal object.
 *
 * Every line is expected to look like "Label: value"; parsing stops at the
 * first separator line (ten or more dashes). Multi-line fields are not
 * supported. Unknown labels and lines without a colon are ignored, and
 * properties whose label does not occur stay NULL.
 *
 * @param array $flines Lines of the record.
 * @return Journal The populated record object.
 */
function parse_journal_ncbilit($flines)
{
    // Explicit label => property map. This replaces the former "$$label = $value"
    // variable-variables, which let the file content overwrite ANY local
    // variable of this function.
    $field_map = array(
        "JrId"         => "id",
        "JournalTitle" => "title",
        "MedAbbr"      => "med_abbr",
        "ISSN"         => "issn",
        "ESSN"         => "essn",
        "IsoAbbr"      => "iso_abbr",
        "NlmId"        => "nlm_id"
    );

    $oJournal = new Journal();
    foreach ($flines as $linestr) {
        // End of entry/record marker.
        if (strncmp($linestr, "----------", 10) == 0) break;

        // Split at the FIRST colon only, so that a value which itself contains
        // ": " (e.g. a journal title with a subtitle) is kept whole.
        $line_r = explode(":", $linestr, 2);
        if (count($line_r) < 2) continue;

        $label = trim($line_r[0]);
        if (isset($field_map[$label])) {
            $property = $field_map[$label];
            $oJournal->$property = trim($line_r[1]);
        }
    }
    return $oJournal;
}

/*
function parse_compound_kegg($flines)
{
    // Initialization of variables.
    $in_name_flag = FALSE;
    $aNames = array();
    $name_string = "";
    $in_path_flag = FALSE;
    $path_string = "";
    $aPaths = array();
    $in_react_flag = FALSE;
    $react_string = "";
    $in_enzyme_flag = FALSE;
    $enzyme_string = "";
    $in_dblink_flag = FALSE;
    $aDblinks = array();

    while( list($lineno, $linestr) = each($flines) ) {
        // OPENS outermost while() loop
        $label = trim(left($linestr, 12));

        // Assume that ENTRY is always one line.
        if ($label == "ENTRY") $entry = trim(substr($linestr, 12));

        // NAME entry is made up of one or more names, the preferred name is at
        // the first line, other alternative names are in succeeding lines. It
        // is possible for a long name to occupy two or more lines. But for now,
        // let's assume one name in one line.
if ($label == "NAME") {
    $aNames = array();
    $aNames[] = trim(substr($linestr,12));
    $in_name_flag = TRUE;
}
elseif ( (strlen($label) == 0) and ($in_name_flag) ) {
    $aNames[] = trim(substr($linestr,12));
}
elseif ( (strlen($label) > 0) and ($in_name_flag) ) {
    $in_name_flag = FALSE;
}
*/
?>
===============================================
MOTIF.INC.PHP - SOURCE CODE
<?php
// left() and trim_element() come from etc.inc.php; this file used them without
// including it. require_once makes the extra include harmless if already loaded.
require_once("etc.inc.php");

/**
 * Plain data holder for one PROSITE motif entry, filled in by
 * parse_motif_prosite(). The $date, $swp_xref, $pdb_xref, $num_results,
 * $comments and $matrix members hold arrays.
 */
class Motif
{
    var $entry_name;
    var $entry_type;
    var $accession;
    var $date;
    var $desc;
    var $pattern;
    var $matrix;
    var $rule;
    var $num_results;
    var $comments;
    var $swp_xref;
    var $pdb_xref;
    var $doc_xref;
}

/**
 * Splits "QUALIFIER=value; QUALIFIER=value;" text into an associative array.
 * Private helper of parse_motif_prosite(); an item without "=" gets the
 * value "".
 */
function _prosite_parse_qualifiers($text)
{
    $pairs = array();
    $items = preg_split("/;/", trim($text), -1, PREG_SPLIT_NO_EMPTY);
    foreach ($items as $item) {
        $qv = preg_split("/=/", $item, -1, PREG_SPLIT_NO_EMPTY);
        if (count($qv) == 0) continue;
        $qualifier = trim($qv[0]);
        if ($qualifier == "") continue;
        $pairs[$qualifier] = isset($qv[1]) ? trim($qv[1]) : "";
    }
    return $pairs;
}

/**
 * Parses the lines of one PROSITE entry into a Motif object.
 *
 * The two-character line code selects the field and the data starts at
 * column 6. Parsing stops at the "//" line. Multi-line fields (DE, PA, MA,
 * NR, CC, RU, 3D, DR) are collected while reading and interpreted once
 * after the loop, so they are no longer lost when they are the last lines
 * of the input, and DR entries are not duplicated when DR lines are not
 * contiguous. Scalar fields that do not occur stay NULL; list fields
 * default to empty arrays.
 *
 * @param array $flines Lines of the entry.
 * @return Motif The populated entry object.
 */
function parse_motif_prosite($flines)
{
    $entry_name = NULL;
    $entry_type = NULL;
    $accession = NULL;
    $aDates = array();
    $doc_string = "";

    // Accumulators for the multi-line fields, plus "seen" flags for the ones
    // that end up as scalars.
    $in_desc_flag = FALSE;
    $desc_string = "";
    $in_pa_flag = FALSE;
    $pa_string = "";
    $in_ru_flag = FALSE;
    $ru_string = "";
    $ma_string = "";
    $nr_string = "";
    $cc_string = "";
    $three_d_string = "";
    $dr_string = "";

    foreach ($flines as $linestr) {
        $label = left($linestr, 2);
        $data = trim(substr($linestr, 5));

        if ($label == "//") break;

        switch ($label) {
            case "ID":
                // ID   ENTRY_NAME; ENTRY_TYPE.
                $words = preg_split("/;/", $data);
                $entry_name = trim($words[0]);
                if (isset($words[1])) $entry_type = rtrim(trim($words[1]), ".");
                break;

            case "AC":
                // One accession terminated by a semi-colon.
                $accession = rtrim($data, ";");
                break;

            case "DT":
                // DATE (EVENT); DATE (EVENT); DATE (EVENT).
                // Keys keep their inner whitespace, e.g. "DATA UPDATE".
                $aDates = array();
                $date_array = preg_split("/;/", $data, -1, PREG_SPLIT_NO_EMPTY);
                foreach ($date_array as $date_item) {
                    $temp = preg_split("/\s\(/", $date_item, -1, PREG_SPLIT_NO_EMPTY);
                    if (count($temp) < 2) continue;
                    // Strip the closing ")" and, on the last item, the final ".".
                    // This replaces the former "third item" special case, which
                    // only worked for lines with exactly three dates.
                    $key = rtrim(trim($temp[1]), ").");
                    $aDates[$key] = trim($temp[0]);
                }
                break;

            case "DE":
                // Description lines are joined with a whitespace.
                $desc_string .= $data . " ";
                $in_desc_flag = TRUE;
                break;

            case "PA":
                // Pattern lines are joined WITHOUT whitespace.
                $pa_string .= $data;
                $in_pa_flag = TRUE;
                break;

            case "MA":
                $ma_string .= $data . " ";
                break;

            case "NR":
                $nr_string .= $data . " ";
                break;

            case "CC":
                $cc_string .= $data . " ";
                break;

            case "RU":
                // Free-format text, lines joined with a whitespace.
                $ru_string .= $data . " ";
                $in_ru_flag = TRUE;
                break;

            case "3D":
                $three_d_string .= $data . " ";
                break;

            case "DR":
                $dr_string .= $data . " ";
                break;

            case "DO":
                // Exactly one entry in one line, terminated by a semi-colon.
                $doc_string = rtrim($data, ";");
                break;
        }
    }

    $description = $in_desc_flag ? trim($desc_string) : NULL;
    $pattern = $in_pa_flag ? trim($pa_string) : NULL;
    $rule = $in_ru_flag ? trim($ru_string) : NULL;

    /* MA - MATRIX data field. Sample:
         MA /GENERAL_SPEC: ALPHABET='ACDEFGHIKLMNPQRSTVWY'; LENGTH=97;
         MA /DISJOINT: DEFINITION=PROTECT; N1=2; N2=96;
       Result: ( "GENERAL_SPEC" => ( "ALPHABET" => ..., "LENGTH" => 97 ), ... )
       NOTE: a qualifier that occurs more than once overwrites its earlier
       occurrence, as before. */
    $aOuter = array();
    $ma_r = preg_split("/\//", trim($ma_string), -1, PREG_SPLIT_NO_EMPTY);
    foreach ($ma_r as $ma_item) {
        $ma_qv = preg_split("/:/", $ma_item, -1, PREG_SPLIT_NO_EMPTY);
        if (count($ma_qv) == 0) continue;
        $qualifier = trim($ma_qv[0]);
        $aOuter[$qualifier] = _prosite_parse_qualifiers(isset($ma_qv[1]) ? $ma_qv[1] : "");
    }

    // NR - NUMERICAL RESULTS and CC - COMMENT: "qualifier=value;" lists.
    $aNRs = _prosite_parse_qualifiers($nr_string);
    $aCCs = _prosite_parse_qualifiers($cc_string);

    // 3D - semi-colon separated list of structure names.
    $aPDB_names = preg_split("/;/", trim($three_d_string), -1, PREG_SPLIT_NO_EMPTY);
    array_walk($aPDB_names, "trim_element");

    // DR - semi-colon separated entries, each a comma-separated list of items.
    $aDBRefs = array();
    $dr_array = preg_split("/;/", trim($dr_string), -1, PREG_SPLIT_NO_EMPTY);
    foreach ($dr_array as $dr_item) {
        $temp = preg_split("/,/", $dr_item, -1, PREG_SPLIT_NO_EMPTY);
        array_walk($temp, "trim_element");
        $aDBRefs[] = $temp;
    }

    $oMotif = new Motif();
    $oMotif->entry_name = $entry_name;
    $oMotif->entry_type = $entry_type;
    $oMotif->accession = $accession;
    $oMotif->date = $aDates;
    $oMotif->desc = $description;
    $oMotif->swp_xref = $aDBRefs;
    $oMotif->pattern = $pattern;
    $oMotif->rule = $rule;
    $oMotif->pdb_xref = $aPDB_names;
    $oMotif->doc_xref = $doc_string;
    $oMotif->num_results = $aNRs;
    $oMotif->comments = $aCCs;
    $oMotif->matrix = $aOuter;
    return $oMotif;
}
?>
===============================================
PDBSTR.INC.PHP - SOURCE CODE
<?php
// pdbstr.inc.php
require_once("etc.inc.php");
require_once("seqdb.inc.php");
require_once("seq.inc.php");

/**
 * Plain data holder for the MEMBER line of a PDBSTR entry.
 */
class Protein_PDBSTR
{
    // MEMBER section
    var $entry_id;
    var $moltype;
    var $length;
    var $entry_group;
    var $create_date;
    var $upd_date;
}

/**
 * Parses the lines of one PDBSTR entry into a Protein_PDBSTR object.
 *
 * Only the MEMBER line is interpreted; the label occupies the first 12
 * columns. Example:
 *   MEMBER      1YIC_01  108  PROTEIN  1YIC  97/02/18  97/07/23
 * Parsing stops at the "//" line. If there are several MEMBER lines the
 * last one wins; items missing from the line are left as NULL.
 *
 * @param array $flines Lines of the entry.
 * @return Protein_PDBSTR The populated entry object.
 */
function parse_protein_pdbstr($flines)
{
    $member_tokens = array();

    foreach ($flines as $linestr) {
        $linelabel = trim(substr($linestr, 0, 12));
        $linedata = trim(substr($linestr, 12));

        if ($linelabel == "//") break;

        if ($linelabel == "MEMBER") {
            // Splitting on whitespace runs already yields trimmed tokens.
            $member_tokens = preg_split("/\s+/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
        }
    }

    // Pad so that a short (or absent) MEMBER line yields NULLs instead of
    // undefined-index errors.
    $member_tokens = array_pad($member_tokens, 6, NULL);

    $oProtein = new Protein_PDBSTR();
    $oProtein->entry_id = $member_tokens[0];
    $oProtein->length = $member_tokens[1];
    $oProtein->moltype = $member_tokens[2];
    $oProtein->entry_group = $member_tokens[3];
    $oProtein->create_date = $member_tokens[4];
    $oProtein->upd_date = $member_tokens[5];
    return $oProtein;
}
?>
===============================================
PIR.INC.PHP - SOURCE CODE
<?php
require_once("etc.inc.php");
require_once("seq.inc.php");

/**
 * Plain data holder for one PIR (CODATA format) sequence entry.
 * NOTE: This is not yet a complete listing of all properties!
 */
class PIRSeq
{
    var $entry_name;
    var $entry_type;
    var $title;
    var $accession;
    var $organism;      // common name of source organism
    var $species;       // scientific name of source organism
    var $create_date;
    var $seqrev_date;
    var $txtchg_date;
    var $length;
    var $molwt;
    var $checksum;
    var $keywords;
}

/**
 * Turns a list of "key value words..." strings into an associative array.
 * Private helper of parse_protein_pir_codata(); the first word of each item
 * is the key, the remaining words joined by single spaces are the value.
 */
function _pir_parse_keyvals($items)
{
    $pairs = array();
    foreach ($items as $keyval) {
        $keyval_tokens = preg_split("/\s+/", $keyval, -1, PREG_SPLIT_NO_EMPTY);
        if (count($keyval_tokens) == 0) continue;
        $key = array_shift($keyval_tokens);
        $pairs[$key] = implode(" ", $keyval_tokens);
    }
    return $pairs;
}

/**
 * Parses a PIR (CODATA) entry and returns a PIRSeq object containing the
 * parsed data.
 *
 * The label occupies the first 16 columns of a line; a line with a blank
 * label continues the preceding TITLE, ORGANISM, DATE, ACCESSIONS or
 * KEYWORDS field. Those fields are collected while reading and interpreted
 * after the loop, so they are no longer lost when no labelled line follows
 * them. Scalar fields that do not occur stay NULL; accession and keywords
 * default to empty arrays.
 *
 * @param array  $flines Lines of the entry.
 * @param string $sql_db Currently unused; kept for interface compatibility.
 * @return PIRSeq The populated entry object.
 */
function parse_protein_pir_codata($flines, $sql_db = "NONE")
{
    $entry_name = NULL;
    $entry_type = NULL;
    $title = NULL;
    $organism = NULL;
    $species = NULL;
    $create_date = NULL;
    $seqrev_date = NULL;
    $txtchg_date = NULL;
    $acc_tokens = array();
    $kw_tokens = array();
    $molwt = NULL;
    $length = NULL;
    $checksum = NULL;

    // Text collected for each multi-line field; NULL = field not seen.
    $multiline = array(
        "TITLE"      => NULL,
        "ORGANISM"   => NULL,
        "DATE"       => NULL,
        "ACCESSIONS" => NULL,
        "KEYWORDS"   => NULL
    );
    $current_field = NULL;   // multi-line field that blank-label lines continue

    foreach ($flines as $linestr) {
        $linelabel = trim(substr($linestr, 0, 16));
        $linedata = trim(substr($linestr, 16));

        // End of record (EOR) marker. "///" is accepted as well.
        // NOTE(review): CODATA files appear to end entries with "///" - confirm.
        if (($linelabel == "//") or ($linelabel == "///")) break;

        if ($linelabel == "") {
            // Continuation line of the multi-line field in progress, if any.
            if ($current_field !== NULL) $multiline[$current_field] .= $linedata . " ";
            continue;
        }

        if (array_key_exists($linelabel, $multiline)) {
            // A field that re-appears after other fields starts over; directly
            // repeated lines of the same field are appended.
            if ($current_field !== $linelabel) $multiline[$linelabel] = "";
            $multiline[$linelabel] .= $linedata . " ";
            $current_field = $linelabel;
            continue;
        }

        // Any other labelled line ends the multi-line field in progress.
        $current_field = NULL;

        if ($linelabel == "ENTRY") {
            /* ENTRY data field - contains the entry name and entry type.
               Syntax:  ENTRY entry_name #key1 val1 #key2 val2 ...
               Example: ENTRY RHTDTO #type complete */
            $line_tokens = preg_split("/#/", $linedata, -1, PREG_SPLIT_NO_EMPTY);
            if (count($line_tokens) > 0) {
                // Assume that the first token is always the entry name. It is
                // trimmed: the split leaves the padding before "#" attached.
                $entry_name = trim(array_shift($line_tokens));
                $aEntry = _pir_parse_keyvals($line_tokens);
                if (isset($aEntry["type"])) $entry_type = $aEntry["type"];
            }
        } elseif ($linelabel == "SUMMARY") {
            /* SUMMARY data field - (sequence) length, molecular weight and
               checksum; always exactly one line.
               Example: SUMMARY #length 3 #molecular-weight 380 #checksum 465 */
            $aSummary = _pir_parse_keyvals(preg_split("/#/", $linedata, -1, PREG_SPLIT_NO_EMPTY));
            if (isset($aSummary["molecular-weight"])) $molwt = (float) $aSummary["molecular-weight"];
            if (isset($aSummary["length"])) $length = (int) $aSummary["length"];
            if (isset($aSummary["checksum"])) $checksum = $aSummary["checksum"];
        }
    }

    /* TITLE data field.
       Example: TITLE R-phycoerythrin alpha-1 chain - red alga (Gastroclonium coulteri) (fragment) */
    if ($multiline["TITLE"] !== NULL) $title = trim($multiline["TITLE"]);

    /* ORGANISM data field.
       Example: ORGANISM #formal_name Oryctolagus cuniculus #common_name domestic rabbit */
    if ($multiline["ORGANISM"] !== NULL) {
        $aOrganism = _pir_parse_keyvals(preg_split("/#/", trim($multiline["ORGANISM"]), -1, PREG_SPLIT_NO_EMPTY));
        if (isset($aOrganism["common_name"])) $organism = $aOrganism["common_name"];
        if (isset($aOrganism["formal_name"])) $species = $aOrganism["formal_name"];
    }

    /* DATE data field.
       Example: DATE 15-Jun-2001 #sequence_revision 15-Jun-2001 #text_change 15-Jun-2001 */
    if ($multiline["DATE"] !== NULL) {
        $date = trim($multiline["DATE"]);
        $date_tokens = preg_split("/#/", $date, -1, PREG_SPLIT_NO_EMPTY);
        // The text before the first "#" is the creation date (trimmed; it used
        // to keep its trailing blank). Nothing is taken if the data starts
        // with "#".
        if ((count($date_tokens) > 0) and (left($date, 1) != "#")) {
            $create_date = trim(array_shift($date_tokens));
        }
        $aDate = _pir_parse_keyvals($date_tokens);
        if (isset($aDate["sequence_revision"])) $seqrev_date = $aDate["sequence_revision"];
        if (isset($aDate["text_change"])) $txtchg_date = $aDate["text_change"];
    }

    /* ACCESSIONS data field - one or more accessions separated by semicolons.
       Example: ACCESSIONS PT0622; PT0680; PT0582; PT0673 */
    if ($multiline["ACCESSIONS"] !== NULL) {
        $acc_tokens = preg_split("/;/", trim($multiline["ACCESSIONS"]), -1, PREG_SPLIT_NO_EMPTY);
        array_walk($acc_tokens, "trim_element");
    }

    /* KEYWORDS data field - same format and handling as ACCESSIONS.
       Example: KEYWORDS amidated carboxyl end; cutaneous gland; hormone; pyroglutamic acid */
    if ($multiline["KEYWORDS"] !== NULL) {
        $kw_tokens = preg_split("/;/", trim($multiline["KEYWORDS"]), -1, PREG_SPLIT_NO_EMPTY);
        array_walk($kw_tokens, "trim_element");
    }

    $oPIRSeq = new PIRSeq();
    $oPIRSeq->entry_name = $entry_name;
    $oPIRSeq->entry_type = $entry_type;
    $oPIRSeq->title = $title;
    $oPIRSeq->organism = $organism;
    $oPIRSeq->species = $species;
    $oPIRSeq->create_date = $create_date;
    $oPIRSeq->seqrev_date = $seqrev_date;
    $oPIRSeq->txtchg_date = $txtchg_date;
    $oPIRSeq->accession = $acc_tokens;
    $oPIRSeq->keywords = $kw_tokens;
    $oPIRSeq->molwt = $molwt;
    $oPIRSeq->length = $length;
    $oPIRSeq->checksum = $checksum;
    return $oPIRSeq;
}   // Closes parse_protein_pir_codata() function definition
?>

[ Home Page ] [ I/O Scripts Page ]

 


Copyright © 2003 by Sergio Gregorio, Jr.
All rights reserved.