BioPHP: PHP for Biocomputing


[ Home Page ] [ I/O Scripts Page ]

Source Code Listing of etc.inc.php
(Include file that contains helper functions)

Note: This is part of the BioPHP 1.1 alpha code set. The code, which is a little over 600 lines long, is still rough. Improvements to the
code are welcome!
<?php
/*
   ETC.INC contains definitions for the SubMatrix and SeqMatch classes. It also contains
   helper functions such as is_blank(), isa_qualifier(), firstchar(), left(), etc.
*/

// =============================================================
// Newly added functions (version 1.1)

// June 21, 2003: Added the three functions below used when writing data as flatfile.

// linechop() cuts $string into consecutive pieces of $wid characters and returns them
// as one string with a "\n" after every piece. Every piece except the first is preceded
// by $prefix. A remainder made up only of whitespace is discarded.
//   $string - text to be chopped
//   $wid    - maximum number of characters per line
//   $prefix - text placed in front of the 2nd, 3rd, ... line
function linechop($string, $wid, $prefix = "")
  {
  $string = (string) $string;
  // A width below 1 never consumes any text, so chopping could not terminate.
  // Return the text unwrapped on a single line instead.
  if ($wid < 1) return $string . "\n";

  $result = "";
  $first = TRUE;
  while (TRUE)
    {
    $chunk = (string) substr($string, 0, $wid);
    $string = (string) substr($string, $wid);
    $result .= ($first ? "" : $prefix) . $chunk . "\n";
    $first = FALSE;
    // Test with trim() and a strict comparison: a loose "== FALSE" test would treat
    // a remaining "0" as empty and silently drop it from the output.
    if (trim($string) === "")
      {
      // remove the last \n from $result if the final piece already ended with one
      if (substr($result, -2) === "\n\n") $result = substr($result, 0, -1);
      break;
      }
    }
  return $result;
  }

/* Assume that $phrase is made up of words separated by a single (not multiple)
   whitespace character.
   VALID: "The quick brown fox"
   WRONG: "The  quick  brown  fox"
*/

// right_word() returns the last word of $phrase. Only $no == 1 (the last word) is
// implemented; any other value of $no returns NULL.
function right_word($phrase, $no = 1)
  {
  // split phrase into words or tokens
  $ph_words = preg_split("/\s/", $phrase);
  if ($no == 1) return array_pop($ph_words);
  return NULL;
  }

// remove_word() returns $phrase without its last word. Only $word_no == "last" is
// implemented; for any other value the words are merely re-joined with single spaces.
function remove_word($phrase, $word_no = "last")
  {
  // split phrase into words or tokens
  $ph_words = preg_split("/\s/", $phrase);
  // remove the last word from $ph_words
  if ($word_no == "last") array_pop($ph_words);
  // join the remaining words with a space
  return implode(" ", $ph_words);
  }

// linechopw() works like linechop() but avoids cutting a word in two: when the cut
// would fall inside a word, that word is moved to the start of the next line.
//   $string - text to be wrapped
//   $wid    - maximum number of characters per line
//   $prefix - text placed in front of the 2nd, 3rd, ... line
function linechopw($string, $wid, $prefix = "")
  {
  $string = (string) $string;
  // A width below 1 never consumes any text; return the text unwrapped.
  if ($wid < 1) return $string . "\n";

  $result = "";
  $first = TRUE;
  while (TRUE)
    {
    $left = (string) substr($string, 0, $wid);
    $right = (string) substr($string, $wid);

    // TRUE when the cut falls between two non-space characters.
    $midword = ($right !== "") && (substr($left, -1) !== " ") && (substr($right, 0, 1) !== " ");

    // The word can only be carried over if $left holds some whitespace to break at.
    // A single word longer than $wid is cut at $wid instead; carrying it over would
    // leave the text unchanged and the loop would never end.
    if ($midword && preg_match("/\s/", $left))
      {
      // we're splitting a whole word
      $last_word = right_word($left);
      $left = remove_word($left);
      $right = ltrim($last_word . $right);
      }
    else $right = ltrim($right);

    $result .= ($first ? "" : $prefix) . $left . "\n";
    $first = FALSE;
    $string = $right;
    // Strict test so that a remaining "0" is not mistaken for "nothing left".
    if (trim($string) === "") break;
    }
  return $result;
  }

// multientry_pad() trims every entry of $array, pads it with str_pad() to $padwid
// characters using $padchar and $padopt, and concatenates the results. A "\n" is added
// after every $entries_per_line entries; each line started after such a break begins
// with $prefix.
function multientry_pad($array, $padchar, $padwid, $padopt, $entries_per_line, $prefix = "")
  {
  $result = "";
  $entry_ctr = 0;
  $line_change = FALSE;
  foreach($array as $entry)
    {
    $entry_ctr++;
    if ($line_change)
      {
      $result .= $prefix;
      $line_change = FALSE;
      }
    $result .= str_pad(trim($entry), $padwid, $padchar, $padopt);
    if ($entry_ctr == $entries_per_line)
      {
      $result .= "\n";
      $entry_ctr = 0;
      $line_change = TRUE;
      }
    }
  return $result;
  }

/* ASSUMPTIONS: By its name, this function is fixed to DBGET of course. But the
   parameters (DBNAME and DBID) can be set by the user. I've put the address
   (http://www.genome.ad.jp...) inside dbget(). It can be placed outside as well.
*/

// dbget() requests "<dbget url>?$dbname+$dbid" with cURL and writes the response body
// into the file $temp_file. Returns TRUE on success; terminates the script with die()
// when the file cannot be opened or the transfer fails.
function dbget($dbname, $dbid, $temp_file)
  {
  // define constants for DBGET system
  $dbget_url = "http://www.genome.ad.jp/dbget-bin/www_bget";

  $fp = fopen($temp_file, "w+");
  if ($fp === FALSE) die("Error in opening temporary file. Script aborted.");

  $ch = curl_init("$dbget_url?$dbname+$dbid");
  if ($ch === FALSE)
    {
    fclose($fp);
    die("CURL error: Script aborted!");
    }
  curl_setopt($ch, CURLOPT_FILE, $fp);
  curl_setopt($ch, CURLOPT_HEADER, 0);
  $ok = curl_exec($ch);
  // Release both handles before checking the outcome so that nothing is left open
  // on the failure path.
  curl_close($ch);
  fclose($fp);
  if ($ok === FALSE) die("CURL error: Script aborted!");
  return TRUE;
  }

// dbgetfile2r() converts the HTML file saved by dbget() into an array of text lines.
// It strips the HTML tags, writes the tag-free text to "$temp_file.notags", and returns
// the lines from the first one containing "ID " up to and including the "//" line.
// The first returned element starts at the "ID " text itself. An empty array is
// returned when no line contains "ID ".
function dbgetfile2r($temp_file)
  {
  // This stores entire DBGET temp file (HTML) into one string, removes HTML tags...
  $raw_lines = file($temp_file);
  if ($raw_lines === FALSE) die("Error in reading temporary file. Script aborted.");
  $flinestr = strip_tags(implode(" ", $raw_lines));

  // ...and writes the result into a second file, with the *.notags extension.
  $file2 = $temp_file . ".notags";
  $fp2 = fopen($file2, "w+");
  if ($fp2 === FALSE) die("Error in opening temporary file. Script aborted.");
  fputs($fp2, $flinestr);
  fclose($fp2);

  // Reading the file back splits the tag-free text into lines again.
  $flines = file($file2);
  if ($flines === FALSE) die("Error in reading temporary file. Script aborted.");

  $new_flines = array();
  $start_line = -1;
  // This loop searches for the line containing the ID data field.
  foreach($flines as $line_no => $linestr)
    {
    // strpos() yields 0 for a match at the very start of the line, so the result
    // has to be compared against FALSE rather than tested with "> 0".
    $posit = strpos($linestr, "ID ");
    if ($posit !== FALSE)
      {
      $new_flines[] = trim(substr($linestr, $posit));
      $start_line = $line_no;
      break;
      }
    }
  // No ID line: there is no record to load.
  if ($start_line < 0) return $new_flines;

  // Skip the "junk" lines up to and including the ID line, then load the rest.
  foreach(array_slice($flines, $start_line + 1) as $linestr)
    {
    $linestr = ltrim($linestr);
    $new_flines[] = str_replace("\n", '', $linestr);
    if (trim($linestr) == "//") break;
    }
  return $new_flines;
  }

// file_verb is on EXPERIMENTAL STATUS. May be removed later.
// It maps a storage type and an operation ("R" or anything else) to a verb string.
// Returns NULL for an unknown $filetype.
function file_verb($filetype, $operation)
  {
  switch($filetype)
    {
    case "FILE":
      return ($operation == "R" ? "parse" : "cat");
    case "VAR":
      return ($operation == "R" ? "eval" : "assign");
    case "SQL":
      return ($operation == "R" ? "select" : "insert");
    case "STDIN":
      return "input";
    case "STDOUT":
      return "output";
    }
  return NULL;
  }

// get_parsefunc() builds the string "parse_<object>_<format>" in lowercase, where
// <format> is the short form returned by get_shortformat().
function get_parsefunc($object, $format)
  {
  $object = strtolower($object);
  $format = strtolower(get_shortformat($format));
  return "parse_" . $object . "_" . $format;
  }

// get_shortformat() returns the uppercase abbreviation of a format name:
// "GENBANK" becomes "GB", "SWISSPROT" becomes "SWP", anything else is just uppercased.
function get_shortformat($format)
  {
  $format = strtoupper($format);
  switch($format)
    {
    case "GENBANK":
      return "GB";
    case "SWISSPROT":
      return "SWP";
    default:
      return $format;
    }
  }

// insert_tagpair() wraps the text between columns $begcol and $endcol (inclusive) of
// $line in $opentag and $closetag. Columns are counted with col2strpos(), i.e. over
// the characters that lie outside of any <...> tag already present in $line.
function insert_tagpair($line, $opentag, $closetag, $begcol, $endcol)
  {
  $begstrpos = col2strpos($line, $begcol);
  $endstrpos = col2strpos($line, $endcol);
  $left = substr($line, 0, $begstrpos);
  $mid = substr($line, $begstrpos, $endstrpos - $begstrpos + 1);
  $right = substr($line, $endstrpos + 1);
  return ($left . $opentag . $mid . $closetag . $right);
  }

// col2strpos() returns the string offset in $seq of the character at column $col,
// where columns are counted from 0 and only characters outside <...> tags count.
// Returns NULL when $seq has no such column.
function col2strpos($seq, $col)
  {
  $strpos = array();
  // Must start as FALSE: the scan begins outside of any tag.
  $intag = FALSE;
  $seqlen = strlen($seq);
  for($i = 0; $i < $seqlen; $i++)
    {
    $sym = substr($seq, $i, 1);
    if ($sym == "<")
      {
      $intag = TRUE;
      continue;
      }
    if ($sym == ">")
      {
      $intag = FALSE;
      continue;
      }
    if ($intag) continue;
    $strpos[] = $i;
    }
  return (isset($strpos[$col]) ? $strpos[$col] : NULL);
  }

// pos2rc() returns an array containing the line number and the column number of
// position $pos. The row is $pos / 60 (whole part). The column is the offset within
// that row, plus one for every full group of 10 in the offset, plus 10.
// NOTE(review): presumably this matches a layout of 60 symbols per line in
// space-separated groups of 10 after a 10-column margin - confirm against the caller.
function pos2rc($pos)
  {
  $rowcol = array();
  $row = ((int) ($pos / 60));
  $rowcol[] = $row;
  $offset = $pos - ($row * 60);
  $decade = ((int) ($offset / 10));
  $col = $offset + $decade + 10;
  $rowcol[] = $col;
  return $rowcol;
  }

// =============================================================

$patterndb = array("_StartCodon" => "AUG", "_EndCodon" => "[UAA,UAG,UGA]");

// trim_element() removes leading and trailing spaces from a string. In conjunction
// with the array_walk() function, it removes spaces from each element of an array.
function trim_element(&$value, $key)
  {
  $value = trim($value);
  }

// June 10, 2003: Added this function which adds a slash for special chars to each
// element in an array.
function addslashes_element(&$value, $key)
  {
  $value = addslashes($value);
  }

// stripslashes_element() is the counterpart of addslashes_element(): it applies
// stripslashes() to the element it is given.
function stripslashes_element(&$value, $key)
  {
  $value = stripslashes($value);
  }

/* is_false() tests if a value is a boolean false and not a zero (0). This is
   necessary to correctly interpret the return value of some PHP functions like
   strpos(). strpos() returns a zero (0) if a string is found at the beginning of a
   larger string, and FALSE if it cannot find that string within the larger string.
   In PHP, FALSE equals 0 under the loose (==) comparison.
*/
function is_false($value)
  {
  return ($value === FALSE);
  }

// rem_right() removes $charcount characters from the right (end) of a string.
function rem_right($str, $charcount = 1)
  {
  // Number of characters that remain. When $charcount is at least the length of $str
  // nothing remains; passing a negative length to substr() would instead be read as
  // "drop that many characters from the end" and return part of the string.
  $keep = strlen($str) - $charcount;
  if ($keep <= 0) return "";
  return substr($str, 0, $keep);
  }

// intrim() removes every space character from a string (leading, internal and
// trailing alike). Other whitespace such as tabs or newlines is left untouched.
function intrim($string)
  {
  // str_replace() is used because the ereg family of functions no longer exists in
  // PHP 7 and later; for a single literal space the result is the same.
  return str_replace(' ', '', $string);
  }

// getmin() gets the minimum of three (usually numeric) values $x, $y, and $z.
// For now, this can't handle situations when one or more arguments is FALSE.
function getmin($x, $y, $z)
  {
  if ($x < $y)
    {
    return (($x < $z) ? $x : $z);
    }
  return (($y < $z) ? $y : $z);
  }

// is_even() tests if an integer is an even number: halving it must leave no
// fractional part.
function is_even($integer)
  {
  $half = $integer / 2;
  return ($half == ((int) $half));
  }

// is_odd() tests if an integer is an odd number. This is the opposite of is_even().
function is_odd($integer)
  {
  $half = $integer / 2;
  return ($half != ((int) $half));
  }

// is_blankstr() tests if a value is a blank string (""). Like is_false(), this
// helps interpret the value of some PHP functions or expressions.
function is_blankstr($var)
  {
  return ($var === "");
  }

// is_notmt() ("is not empty") is the negation of is_blankstr().
function is_notmt($var)
  {
  return ( !(is_blankstr($var)) );
  }

// I think this function should give way to or be replaced by is_blankstr().
// I haven't removed this yet as I have to check if some code still uses it.
// Unlike is_blankstr(), the comparison is loose (==), so values such as NULL and
// FALSE are also reported as blank.
function is_blank($str)
  {
  return ($str == "");
  }

// firstchar() returns the first or beginning character of a string.
function firstchar($str)
  {
  return left($str, 1);
  }

// left() returns the first $numchars characters of a string.
function left($str, $numchars)
  {
  return substr($str, 0, $numchars);
  }

// right() returns the substring beginning at $numchars characters from the right end of a string.
function right($str, $numchars)
  {
  // Asking for zero (or fewer) characters always yields an empty string.
  if ($numchars <= 0) return "";
  // A negative offset counts from the end of the string; if $numchars exceeds the
  // length of $str the whole string is returned.
  return (string) substr($str, -$numchars);
  }

// compare_letter() compares two letters $let1 and $let2 and returns another letter
// indicating if the two were exact matches, partial matches, or non-matches.
//   $matrix  - array of rules (each rule an array of symbols); when NULL or omitted,
//              the rules of the global $chemgrp_matrix are used
//   $equal   - symbol returned for an exact match; when NULL or omitted, $let1 is returned
//   $partial - symbol returned when partial_match() reports a partial match
//   $nomatch - symbol returned otherwise
function compare_letter($let1, $let2, $matrix = NULL, $equal = NULL, $partial = "+", $nomatch = ".")
  {
  global $chemgrp_matrix;

  // if no custom substitution matrix was provided, use the default. Should the
  // default matrix not have been set up, fall back to an empty rule set.
  if (isset($matrix) == FALSE)
    $matrix = (isset($chemgrp_matrix->rules) ? $chemgrp_matrix->rules : array());
  // if no symbol for exact matches was provided, use the residue symbol.
  if (isset($equal) == FALSE) $equal = $let1;

  if ($let1 == $let2) return $equal;
  if (partial_match($let1, $let2, $matrix)) return $partial;
  return $nomatch;
  }

/* Algorithm: We abbreviate substitution matrix to "submatrix". Each element in a
   submatrix is an array of symbols that are considered "partial matches" of each
   other.

   Default submatrix:
     ( ('G','A','V','L','I'), ('S','T'), ('N','Q'), ('F','Y','W'), ('C', 'M'),
       ('P'), ('D','E'), ('K','R','H'), ('*'), ('X') )

   1) Check if both $let1 and $let2 appear in the first element (G,A,V,L,I) of the
      substitution matrix.
   2) If they are, you've found a "hit", and $let1 and $let2 are partial matches.
      Return a TRUE value. If they are not, then go to the next element in the
      substitution matrix. Repeat steps 1 and 2 until you reach a submatrix element
      where both $let1 and $let2 appear, or until the last element in the submatrix
      has been checked.
   3) If you reach the last submatrix element without a "hit", return a FALSE value.

   NOTE: This will not warn you if $let1 and/or $let2 is nowhere to be found in the
   whole submatrix.
*/
function partial_match($let1, $let2, $matrix = NULL)
  {
  global $chemgrp_matrix;

  // Same fallback as in compare_letter(): default rules, or none if they are not set.
  if (isset($matrix) == FALSE)
    $matrix = (isset($chemgrp_matrix->rules) ? $chemgrp_matrix->rules : array());

  foreach($matrix as $rule)
    {
    if ((in_array($let1, $rule)) and (in_array($let2, $rule))) return TRUE;
    }
  return FALSE;
  }

// getpattern() retrieves the pattern string from the pattern database ($patterndb array).
function getpattern($pattern)
  {
  global $patterndb;
  // Returns NULL for a pattern name that is not in the database.
  return (isset($patterndb[$pattern]) ? $patterndb[$pattern] : NULL);
  }

// This class allows the use of customized substitution matrices. See tech doc for details.
class submatrix
  {
  // List of rules; each rule is an array of symbols. It is initialised right here so
  // that it is an array no matter which constructor convention the running PHP
  // version follows.
  var $rules = array();

  // Constructor (PHP 5 and later): initializes the rules property to the empty array.
  function __construct()
    {
    $this->rules = array();
    }

  // submatrix simply initializes the rules property to the empty array. Kept because
  // a method named after its class is what PHP 4 calls as the constructor.
  function submatrix()
    {
    $this->rules = array();
    }

  // addrule() adds a rule to the substitution matrix. All arguments passed in the
  // call together form one rule, e.g. addrule('S', 'T').
  function addrule($x)
    {
    array_push($this->rules, func_get_args());
    }
  }

class SeqMatch
  {
  var $result;
  var $hamdist;
  var $levdist;

  // _seqstring() is an internal helper returning the string to compare: the sequence
  // property when $seq is an object, the string form of any other scalar value, and
  // an empty string for everything else.
  function _seqstring($seq)
    {
    if (is_object($seq)) return $seq->sequence;
    if (is_scalar($seq)) return (string) $seq;
    return "";
    }

  // hamdist() computes the Hamming Distance between two strings or Seq objects
  // of equal length. For more information, consult the technical reference.
  // The script is terminated with die() if the two lengths differ.
  function hamdist($seq1, $seq2)
    {
    // If an argument is a Seq object, we use its sequence property.
    $string1 = $this->_seqstring($seq1);
    $string2 = $this->_seqstring($seq2);

    // We terminate code execution if the two strings differ in length.
    if (strlen($string1) != strlen($string2)) die("Both sequence must be of the same length!");
    $len = strlen($string1);

    // Initialize the hamming distance to 0 (no difference between two strings).
    $distance = 0;
    // Match the two strings, character by character. If they are NOT
    // identical, increment $distance by 1.
    for($i = 0; $i < $len; $i++)
      {
      if ($string1[$i] !== $string2[$i]) $distance++;
      }
    return $distance;
    }

  // levdist() computes the Levenshtein Distance between two strings or Seq objects
  // with equal/unequal lengths. You can pass custom values for cost of insertion,
  // replacement, and deletion. If you don't pass any, they are assumed to be 1.
  // For more information, see technical reference.
  function levdist($seq1, $seq2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
    {
    // If an argument is a Seq object, we use its sequence property.
    $string1 = $this->_seqstring($seq1);
    $string2 = $this->_seqstring($seq2);

    // Check the lengths of the two strings. If they exceed 255 characters, terminate code.
    if (strlen($string1) > 255) die("String length must not exceed 255 characters!");
    if (strlen($string2) > 255) die("String length must not exceed 255 characters!");

    // Compute and return the Levenshtein Distance using PHP's built-in levenshtein() function.
    return levenshtein($string1, $string2, $cost_ins, $cost_rep, $cost_del);
    }

  // xlevdist() is an extended version of levdist() which accepts strings with length
  // greater than 255 but not to exceed 1024. It computes the distance itself with the
  // usual dynamic-programming table, keeping only two rows of it in memory.
  //   $s, $t    - the strings to compare ($s is transformed into $t)
  //   $cost_ins - cost of inserting a character (default 1)
  //   $cost_rep - cost of replacing a character (default 1)
  //   $cost_del - cost of deleting a character (default 1)
  function xlevdist($s, $t, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
    {
    $n = strlen($s);
    $m = strlen($t);
    if (($n > 1024) or ($m > 1024)) die("String length must not exceed 1024 characters");

    // Row 0: turning the empty prefix of $s into the first $j characters of $t
    // takes $j insertions.
    $prev = array();
    for($j = 0; $j <= $m; $j++) $prev[$j] = $j * $cost_ins;

    for($i = 1; $i <= $n; $i++)
      {
      // Column 0: turning the first $i characters of $s into "" takes $i deletions.
      $curr = array($prev[0] + $cost_del);
      $lets = $s[$i - 1];
      for($j = 1; $j <= $m; $j++)
        {
        $replace = $prev[$j - 1] + (($lets === $t[$j - 1]) ? 0 : $cost_rep);
        $delete = $prev[$j] + $cost_del;
        $insert = $curr[$j - 1] + $cost_ins;
        $curr[$j] = min($replace, $delete, $insert);
        }
      $prev = $curr;
      }
    return $prev[$m];
    } // closes function xlevdist()

  /* The match() method accepts two sequence strings (not objects) of equal length,
     and returns a sequence match result string, according to the following rules:
       If there is an exact match, return $equal (by default the amino acid symbol).
       If there is a partial match, return $partial (by default a plus sign).
       If there is no match, return $nomatch (by default a period).
     $matrix is the array of rules to use; when NULL or omitted, the rules of the
     global $chemgrp_matrix are used. The result is also stored in $this->result.
     The script is terminated with die() if the two lengths differ.
  */
  function match($str1, $str2, $matrix = NULL, $equal = NULL, $partial = "+", $nomatch = ".")
    {
    global $chemgrp_matrix;

    // if the user chose not to use a custom submatrix, use the default one. Should
    // the default matrix not have been set up, fall back to an empty rule set.
    if (isset($matrix) == FALSE)
      $matrix = (isset($chemgrp_matrix->rules) ? $chemgrp_matrix->rules : array());

    // if the strings differ in length, terminate code execution.
    if (strlen($str1) != strlen($str2)) die("Cannot match sequences with unequal lengths");

    $resultstr = "";
    $seqlength = strlen($str1);
    // Match the two strings, character by character. Each call to compare_letter()
    // function returns a "result character" which is appended to a "result string".
    for($i = 0; $i < $seqlength; $i++)
      {
      $let1 = substr($str1, $i, 1);
      $let2 = substr($str2, $i, 1);
      $resultstr .= compare_letter($let1, $let2, $matrix, $equal, $partial, $nomatch);
      }

    // Assign "result string" to the result property of the calling SeqMatch object.
    $this->result = $resultstr;
    // Return the result string. While this line and the line above seem redundant, their
    // presence here actually permits programmers to write more compact code.
    return $resultstr;
    }
  }
?>

[ Home Page ] [ I/O Scripts Page ]

 


Copyright © 2003 by Sergio Gregorio, Jr.
All rights reserved.