[ Home
Page ] [ I/O Scripts Page
]
Source Code Listing of etc.inc.php
(Include file that contains helper functions)
Note: This is part of BioPHP 1.1 alpha code set. The code, which is
approximately 600+ lines long, is still rough. Improvements to the code are welcome!
<?php
/* ETC.INC contains definitions for the SubMatrix and SeqMatch classes.
It also contains helper functions such as is_blank(), isa_qualifier(),
firstchar(), left(), etc. */
// =============================================================
// Newly added functions (version 1.1)
// June 21, 2003: Added the three functions below used when writing data as flatfile.
function linechop($string, $wid, $prefix = "")
{
$result = "";
$i = 0;
while(1)
{
$i++;
$left = substr($string, 0, $wid);
$right = substr($string, $wid);
if ($i == 1) $result .= $left . "\n";
else $result .= $prefix . $left . "\n";
$string = $right;
if (($string == FALSE) or (trim($string) == "") )
{ // remove the last \n from $result
if (right($result,2) == "\n\n") $result = left($result, strlen($result)-1);
break;
}
}
return $result;
}
/* Assume that $phrase is made up of words separated by a single (not multiple)
whitespace character.
VALID: "The quick brown fox"
WRONG: "The quick brown fox"
*/
function right_word($phrase, $no = 1)
{
// split phrase into words or tokens
$ph_words = preg_split("/\s/", $phrase);
if ($no == 1) return (array_pop($ph_words));
}
function remove_word($phrase, $word_no = "last")
{
// split phrase into words or tokens
$ph_words = preg_split("/\s/", $phrase);
if ($word_no == "last")
// remove the last word from $ph_words
$last_word = array_pop($ph_words);
// join the remaining words with a space
$new_phrase = implode(" ", $ph_words);
return $new_phrase;
}
function linechopw($string, $wid, $prefix = "")
{
$result = "";
$i = 0;
while(1)
{
$i++;
$left = substr($string, 0, $wid);
$right = substr($string, $wid);
if ( ($right) and (right($left,1) != " ") and (left($right,1) != " ") )
{ // we're splitting a whole word
$last_word = right_word($left);
$left = remove_word($left);
$right = ltrim($last_word . $right);
}
else $right = ltrim($right);
if ($i == 1) $result .= $left . "\n";
else $result .= $prefix . $left . "\n";
$string = $right;
if ( ($string == FALSE) or (trim($string) == "") ) break;
}
return $result;
}
function multientry_pad($array, $padchar, $padwid, $padopt, $entries_per_line, $prefix = "")
{
$result = "";
$entry_ctr = 0;
$line_ctr = 0;
$line_change = FALSE;
foreach($array as $entry)
{
$entry_ctr++;
if ($line_change)
{
$result .= $prefix;
$line_change = FALSE;
}
$result .= str_pad(trim($entry), $padwid, $padchar, $padopt);
if ($entry_ctr == $entries_per_line)
{
$result .= "\n";
$entry_ctr = 0;
$line_change = TRUE;
}
}
return $result;
}
/* ASSUMPTIONS: By its name, this function is fixed to DBGET of course. But
the parameters (DBNAME and DBID) can be set by the user. I've put the
address (http://www.genome.ad.jp...) inside dbget(). It can be placed
outside as well.
*/
function dbget($dbname, $dbid, $temp_file)
{
// define constants for DBGET system
$dbget_url = "http://www.genome.ad.jp/dbget-bin/www_bget";
// debugger echo code
/*
print "$dbget_url?$dbname+$dbid";
print "<BR>";
print $temp_file;
die("STOP MUNA");
*/
// This block does the work of retrieving data from $dbname via DBGET and saving
// the results as a file named $temp_file in the folder $temp_dir.
$ch = curl_init("$dbget_url?$dbname+$dbid");
$fp = fopen($temp_file, "w+");
if ($fp == FALSE) die("Error in opening temporary file. Script aborted.");
curl_setopt ($ch, CURLOPT_FILE, $fp);
curl_setopt ($ch, CURLOPT_HEADER, 0);
$ok = curl_exec ($ch);
if ($ok == FALSE) die("CURL error: Script aborted!");
curl_close ($ch);
fclose($fp);
return TRUE;
}
function dbgetfile2r($temp_file)
{
// This stores entire DBGET temp file (HTML) into one string, removes HTML tags...
$flines = file($temp_file);
$flinestr = implode(" ", $flines);
$flinestr = strip_tags($flinestr);
// ...and writes the result into a second file, with the *.notags extension.
$file2 = $temp_file . ".notags";
$fp2 = fopen($file2, "w+");
fputs($fp2, $flinestr);
fclose($fp2);
$flines = file($file2);
$new_flines = array();
$line_ctr = 0;
// This loop searches for the line containing the ID data field.
foreach($flines as $linestr)
{
if ( ($posit = strpos($linestr, "ID ")) > 0 )
{
$new_flines[] = trim(substr($linestr, $posit));
$start_line = $line_ctr;
break;
}
$line_ctr++;
}
// remove "junk data/lines" from the $flines array
for($i = 0; $i <= $start_line; $i++)
array_shift($flines);
reset($flines);
// start loading data into $flines array
foreach($flines as $linestr)
{
$linestr = ltrim($linestr);
$new_flines[] = preg_replace("/\n/", '', $linestr);
if (trim($linestr) == "//") break;
}
return ($new_flines);
}
// file_verb is on EXPERIMENTAL STATUS. May be removed later.
function file_verb($filetype, $operation)
{
switch($filetype)
{
case "FILE": return ($operation == "R" ? "parse" : "cat");
case "VAR": return ($operation == "R" ? "eval" : "assign");
case "SQL": return ($operation == "R" ? "select" : "insert");
case "STDIN": return "input";
case "STDOUT": return "output";
}
}
function get_parsefunc($object, $format)
{
$object = strtolower($object);
$format = strtolower(get_shortformat($format));
return "parse_" . $object . "_" . $format;
}
function get_shortformat($format)
{
$format = strtoupper($format);
switch($format)
{
case "GENBANK": return "GB";
case "SWISSPROT": return "SWP";
default: return $format;
}
}
function insert_tagpair($line, $opentag, $closetag, $begcol, $endcol)
{
$begstrpos = col2strpos($line, $begcol);
$endstrpos = col2strpos($line, $endcol);
$left = substr($line, 0, $begstrpos);
$mid = substr($line, $begstrpos, $endstrpos-$begstrpos+1);
$right = substr($line, $endstrpos+1);
return ($left . $opentag . $mid . $closetag . $right);
}
function col2strpos($seq, $col)
{
$strip = strip_tags($seq);
$strpos = array();
$opentag = FALSE;
$closetag = FALSE;
for($i = 0; $i < strlen($seq); $i++)
{
$sym = substr($seq, $i, 1);
if ($sym == "<")
{
$intag = TRUE;
continue;
}
if ($sym == ">")
{
$intag = FALSE;
continue;
}
if ($intag) continue;
$strpos[] = $i;
}
return $strpos[$col];
}
// returns an array containing the line number column no.
function pos2rc($pos)
{
$rowcol = array();
$row = ((int) ($pos / 60));
$rowcol[] = $row;
$offset = $pos - (($row) * 60);
$decade = ((int) ($offset / 10));
$col = $offset + $decade + 10;
$rowcol[] = $col;
return $rowcol;
}
// =============================================================
$patterndb = array("_StartCodon" => "AUG", "_EndCodon" => "[UAA,UAG,UGA]");
// trim_element() removes leading and trailing spaces from a string. In conjunction
// with the array_walk() function, it removes spaces from each element of an array.
function trim_element(&$value, $key)
{
$value = trim($value);
}
// June 10, 2003: Added this function which adds a slash for special chars to each
// element in an array.
function addslashes_element(&$value, $key)
{
$value = addslashes($value);
}
function stripslashes_element(&$value, $key)
{
$value = stripslashes($value);
}
/*
is_false() tests if a value is a boolean false and not a zero (0). This is necessary
to correctly interpret the return value of some PHP functions like strpos(). strpos()
returns a zero (0) if a string is found at the beginning of a larger string, and FALSE
if it cannot find that string within the larger string. In PHP, FALSE equals 0.
*/
function is_false($value)
{
if ( (gettype($value) == "boolean") and
($value == FALSE) ) return TRUE;
else return FALSE;
}
// rem_right() removes $charcount characters from the right (end) of a string.
function rem_right($str, $charcount = 1)
{
return substr($str, 0, strlen($str)-$charcount);
}
// intrim() removes "internal spaces" (as opposed to leading and trailing spaces) from a string.
function intrim($string)
{
return eregi_replace(' ', '', $string);
}
// getmin() gets the minimum of three (usually numeric) values $x, $y, and $z.
// For now, this can't handle situations when one or more arguments is FALSE.
function getmin($x, $y, $z)
{
if ($x < $y)
if ($x < $z) return $x;
else return $z;
else
if ($y < $z) return $y;
else return $z;
}
// is_even() tests if an integer is an even number.
function is_even($integer)
{
if (($integer/2) == ((int) ($integer/2))) return TRUE;
else return FALSE;
}
// is_odd() tests if an integer is an odd number. This is the opposite of is_even().
function is_odd($integer)
{
if (($integer/2) != ((int) ($integer/2))) return TRUE;
else return FALSE;
}
// is_blankstr() tests if a value is a blank string (""). Like is_false(), this
// helps interpret the value of some PHP functions or expressions.
function is_blankstr($var)
{
if ( (gettype($var) == "string") and ($var == "") ) return TRUE;
else return FALSE;
}
function is_notmt($var)
{
return ( !(is_blankstr($var)) );
}
// I think this function should give way to or be replaced by is_blankstr().
// I haven't removed this yet as I have to check if some code still uses it.
function is_blank($str)
{
if ($str == "") return true;
else return false;
}
// firstchar() returns the first or beginning character of a string.
function firstchar($str)
{
return left($str, 1);
}
// left() returns the first $numchars characters of a string.
function left($str, $numchars)
{
return substr($str, 0, $numchars);
}
// right() returns the substring beginning at $numchars characters from the right end of a string.
function right($str, $numchars)
{
return substr($str, strlen($str)-$numchars);
}
// compare_letter() compares two letters $let1 and $let2 and returns another letter
// indicating if the two were exact matches, partial matches, or non-matches.
function compare_letter($let1, $let2, $matrix, $equal, $partial = "+", $nomatch = ".")
{
global $chemgrp_matrix;
// if no custom substitution matrix was provided, use the default.
if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;
// if no symbol for exact matches was provided, use the residue symbol.
if (isset($equal) == FALSE) $equal = $let1;
if ($let1 == $let2) return $equal;
elseif (partial_match($let1, $let2, $matrix)) return $partial;
else return $nomatch;
}
/* Algorithm:
We abbreviate substitution matrix to "submatrix". Each element in a submatrix is an array of
symbols that are considered "partial matches" of each other.
Default submatrix:
( ('G','A','V','L','I'), ('S','T'), ('N','Q'), ('F','Y','W'), ('C', 'M'), ('P'), ('D','E'), ('K','R','H'),
('*'), ('X') )
1) Check if both $let1 and $let2 appear in the first element (G,A,V,L,I) of the substitution matrix.
2) If they are, you've found a "hit", and $let1 and $let2 are partial matches. Return a TRUE value.
If they are not, then go to the next element in the substitution matrix.
Repeat steps 1 and 2 until you reach a submatrix element where both $let1 and $let2 appear, or
until the last element in the submatrix has been checked.
3) If you reach the last submatrix element without a "hit", return a FALSE value.
NOTE: This will not warn if you $let1 and/or $let2 is nowhere to be found in the whole submatrix.
*/
function partial_match($let1, $let2, $matrix)
{
global $chemgrp_matrix;
if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;
foreach($matrix as $rule)
if ((in_array($let1, $rule)) and (in_array($let2, $rule))) return TRUE;
return FALSE;
}
// getpattern() retrieves the pattern string from the pattern database ($patternDB array).
function getpattern($pattern)
{
global $patterndb;
return $patterndb[$pattern];
}
// This class allows the use of customized substitution matrices. See tech doc for details.
class submatrix
{
var $rules;
// submatrix simply initializes the rules property to the empty array.
function submatrix()
{
$this->rules = array();
}
// addrule() adds a rule to the substitution matrix.
function addrule($x)
{
$x = func_get_args();
// if (isset($this->rules) == FALSE) $this->rules = array();
array_push($this->rules, $x);
}
}
class SeqMatch
{
var $result;
var $hamdist;
var $levdist;
// hamdist() computes the Hamming Distance between two strings or Seq objects
// of equal length. For more information, consult the technical reference.
function hamdist($seq1, $seq2)
{
// If $seq1 is a Seq object, we use its sequence property to compute Hamming Distance.
if (gettype($seq1) == "object") $string1 = $seq1->sequence;
elseif (gettype($seq1) == "string") $string1 = $seq1;
// If $seq2 is a Seq object, we use its sequence property to compute Hamming Distance.
if (gettype($seq2) == "object") $string2 = $seq2->sequence;
elseif (gettype($seq2) == "string") $string2 = $seq2;
// We terminate code execution if the two strings differ in length.
if (strlen($string1) != strlen($string2))
die("Both sequence must be of the same length!");
$len = strlen($string1);
// Initialize the hamming distance to 0 (no difference between two strings).
$distance = 0;
// Match the two strings, character by character. If they are NOT
// identical, increment $distance by 1.
for($i = 0; $i < $len; $i++)
{
$let1 = substr($string1, $i, 1);
$let2 = substr($string2, $i, 1);
if ($let1 != $let2) $distance++;
}
return $distance;
}
// levdist() computes the Levenshtein Distance between two strings or Seq objects
// with equal/unequal lengths. You can pass custom values for cost of insertion,
// replacement, and deletion. If you don't pass any, they are assumed to be 1.
// For more information, see technical reference.
function levdist($seq1, $seq2, $cost_ins = 1, $cost_rep = 1, $cost_del = 1)
{
// If $seq1 is a Seq object, we use its sequence property to compute Levenshtein Distance.
if (gettype($seq1) == "object") $string1 = $seq1->sequence;
elseif (gettype($seq1) == "string") $string1 = $seq1;
// If $seq2 is a Seq object, we use its sequence property to compute Levenshtein Distance.
if (gettype($seq2) == "object") $string2 = $seq2->sequence;
elseif (gettype($seq2) == "string") $string2 = $seq2;
// Check the lengths of the two strings. If they exceed 255 characters, terminate code.
if (strlen($string1) > 255) die("String length must not exceed 255 characters!");
if (strlen($string2) > 255) die("String length must not exceed 255 characters!");
// Compute and return the Levenshtein Distance using PHP's built-in levenshtein() function.
return levenshtein($string1, $string2, $cost_ins, $cost_rep, $cost_del);
}
// xlevdist() is an extended version of levdist() which accepts strings with length
// greater than 255 but not to exceed 1024 (which takes my CPU 18 seconds to compute).
// The only drawback to xlevdist is that the cost of insertion, deletion, and replacement
// is fixed to 1. I have yet to find a way to allow custom values for these.
function xlevdist($s, $t)
{
$n = strlen($s);
$m = strlen($t);
if (($n > 1024) or ($m > 1024)) die("String length must not exceed 1024 characters");
// initialize the array
$values = array();
$temp = array();
$temp[0] = 0;
for($j = 1; $j <= $m; $j++)
$temp[$j] = 0;
$values[0] = $temp;
for($i = 1; $i <= $n; $i++)
$values[$i] = $temp;
for($i = 1; $i <= $n; $i++)
{ // OPENS for($i = 1; $i <= $n; $i++)
$lets = substr($s, $i-1, 1);
for($j = 1; $j <= $m; $j++)
{ // OPENS for($j = 1; $j <= $m; $j++)
$lett = substr($t, $j-1, 1);
if ($lets == $lett) $cost = 0;
else $cost = 1;
// "normal" values of $up, $left, and $upleft
if ($j > 1) $up = $values[$i][$j-1];
else $up = FALSE;
if ($i > 1) $left = $values[$i-1][$j];
else $left = FALSE;
if (($i > 1) and ($j > 1)) $upleft = $values[$i-1][$j-1];
else $upleft = FALSE;
if ($i == 1)
{
if ($j == 1) $value = $cost;
elseif ($cost == 0) $value = $cost;
else $value = $up + 1;
}
else
{
// if at the first or topmost row, there is no upleft and above.
if ($j == 1)
{
if ($cost == 0) $value = $cost;
else $value = $left + 1;
}
else $value = getmin($up + 1, $left + 1, $upleft + $cost);
}
$values[$i][$j] = $value;
} // CLOSES for($j = 1; $j <= $m; $j++)
} // CLOSES for($i = 1; $i <= $n; $i++)
return $values[$n][$m];
} // closes function xlevdist()
/*
The match() method accepts two sequence strings (not objects) of equal length,
and returns a sequence match result string, according to the following rules:
If there is an exact match, return the amino acid symbol.
If there is a partial match, return a plus sign.
If there is no match, return a whitespace character.
*/
function match($str1, $str2, $matrix, $equal, $partial = "+", $nomatch = ".")
{
global $chemgrp_matrix;
// if the user chose not to use a custom submatrix, use the default one.
if (isset($matrix) == FALSE) $matrix = $chemgrp_matrix->rules;
// if the strings differ in length, terminate code execution.
if (strlen($str1) != strlen($str2))
die("Cannot match sequences with unequal lengths");
$resultstr = "";
$seqlength = strlen($str1);
// Match the two strings, character by character. Each call to compare_letter()
// function returns a "result character" which is appended to a "result string".
for($i = 0; $i < $seqlength; $i++)
{
$let1 = substr($str1, $i, 1);
$let2 = substr($str2, $i, 1);
$resultstr = $resultstr . compare_letter($let1, $let2, $matrix, $equal, $partial, $nomatch);
}
// Assign "result string" to the result property of the calling SeqMatch object.
$this->result = $resultstr;
// Return the result string. While this line and the line above seems redundant, their
// presense here actually permits programmers to write more compact code.
return $resultstr;
}
}
?>
|