CS 478 Programming Assignment 2 - James Landis

0)) { // parse this section if data has been submitted from the web and the training and test set sizes are valid require("id3.php"); // the file containing the function definitions $MAX_STRLEN = 50; // maximum lenth of attribute names or values $connected = mysql_connect("", "", ""); // connnect to database if ($connected) mysql_select_db("id3"); else { echo "

MySQL Connection Failed!

\n"; echo "The MySQL server denied access to the database. Please try again later.

\n"; echo mysql_error(); exit(); } if (strlen($data_url) > 0) { // get data from external url $data_url = ereg_replace("'","",stripslashes($data_url)); $file = @file($data_url); $data_set = ""; for ($i=0; $i < sizeof($file); $i++) $data_set = $data_set.$file[$i]; } // parse data to be entered into database $data_set = ereg_replace("\r","",$data_set); $attributes = explode("instances",$data_set); $attributes = trim(ereg_replace("\n"," ",ereg_replace("-","_",eregi_replace("attributes","",$attributes[0])))); $attributes = explode(" ",ereg_replace("( )+"," ",strtoupper($attributes))); $data_set = explode("instances",$data_set); $data_set = explode("\n",trim($data_set[1])); // delete old data mysql_query("drop table training"); mysql_query("drop table test"); mysql_query("drop table attributes"); // create new table definitions $query = "(instance varchar($MAX_STRLEN), "; for ($i=0; $i < sizeof($attributes); $i++) $query = $query.$attributes[$i]." varchar(20), "; $query = $query."classification varchar($MAX_STRLEN))"; mysql_query("create table training $query"); mysql_query("create table test $query"); // initialize randomizer srand((double)microtime()*1000000); $training_size = 0; $test_size = 0; $unused = ""; // randomly choose which data set each element should belong to, weighted accordingly // e.g. 
there is a 10% chance that the element belongs to the training set if training_percentage = 10 // if the element doesn't fall into the training set or test set, save it for later in case the sets are too small for ($i=0; $i < sizeof($data_set); $i++) { $r = rand(0,99); if (($r < $training_percentage) && ($training_size < round(sizeof($data_set) * $training_percentage/100))) { mysql_query("insert into training values('".ereg_replace("( )+","','",trim($data_set[$i]))."')"); $training_size++; } else if (($r < $training_percentage + $test_percentage) && ($test_size < round(sizeof($data_set) * $test_percentage/100))) { mysql_query("insert into test values('".ereg_replace("( )+","','",trim($data_set[$i]))."')"); $test_size++; } else $unused = $unused." $i"; } // if the training set does not have training_percentage percent of the elements, add elements randomly until it does // added elements are removed from the unused set so they are not used more than once, or used in the test set later while (($training_size < round(sizeof($data_set) * $training_percentage/100))) { $temp = explode(" ",trim($unused)); if (sizeof($temp) == 1) $index = 0; else $index = rand(0,sizeof($temp)-1); $index = $temp[$index]; mysql_query("insert into training values('".ereg_replace("( )+","','",trim($data_set[$index]))."')"); $unused = ereg_replace("( )+$index( )+"," ",$unused." "); $training_size++; } if ($training_size > 0) { // if the test set does not have test_percentage percent of the elements, add new elements randomly until it does while (($test_size < round(sizeof($data_set) * $test_percentage/100))) { $temp = explode(" ",trim($unused)); if (sizeof($temp) == 1) $index = 0; else $index = rand(0,sizeof($temp)-1); $index = $temp[$index]; mysql_query("insert into test values('".ereg_replace("( )+","','",trim($data_set[$index]))."')"); $unused = ereg_replace("( )+$index( )+"," ",$unused." 
"); $test_size++; } // create table of attributes and the number of possible values mysql_query("create table attributes (id varchar($MAX_STRLEN), num_values smallint unsigned)"); for ($i=0; $i < sizeof($attributes); $i++) { $name = $attributes[$i]; $query = "insert into attributes values('$name',".mysql_numrows(mysql_query("select $name from training group by $name")).")"; mysql_query($query); } $max_val = mysql_result(mysql_query("select max(num_values) from attributes"),0,0); // generate id3 tree $dtree = train_id3_dtree($dtree, "root", sizeof($attributes), implode(" ",$attributes), '', $max_val); // generate trace $trace = id3_dtree_trace($dtree, "root", 1); } else $trace = ""; if ($training_size > 0) { // check tree performance on training set $instances = mysql_query("select instance, classification from training"); $correct_training = $training_size; for ($i=0; $i < mysql_numrows($instances); $i++) { $instance = mysql_result($instances, $i, 0); $tree_class = id3_dtree_classify($dtree, "root", "training", $instance); if ($tree_class <> mysql_result($instances, $i, 1)) $correct_training--; } $training_accuracy = bcdiv(100 * $correct_training, $training_size, 3); } else $training_accuracy = 0; if (($training_size > 0) && ($test_size > 0)) { // check tree performance on test set $instances = mysql_query("select instance, classification from test"); $correct_testing = $test_size; for ($i=0; $i < mysql_numrows($instances); $i++) { $instance = mysql_result($instances, $i, 0); $tree_class = id3_dtree_classify($dtree, "root", "test", $instance); if ($tree_class <> mysql_result($instances, $i, 1)) $correct_testing--; } $test_accuracy = bcdiv(100 * $correct_testing, $test_size, 3); } else $test_accuracy = 0; // generate trace file echo "

Decision tree training parameters

\n"; echo "
Data set name       : ";
  // Print the data set's name: the URL it was fetched from when one was given,
  // otherwise the name entered alongside the pasted data. These values appear
  // to arrive straight from the submitted form, so they are HTML-escaped before
  // being written into the page to prevent markup/script injection.
  if (strlen($data_url) > 0) echo htmlspecialchars((string) $data_url, ENT_QUOTES);
  else echo htmlspecialchars((string) $set_name, ENT_QUOTES);
  echo "\n";
  // $data_set holds one array element per instance line at this point.
  echo "Size of data set    : ".sizeof($data_set)."\n";
  // The percentage is also request-supplied; escape it rather than trusting it to be numeric.
  echo "Training percentage : ".htmlspecialchars((string) $training_percentage, ENT_QUOTES)."%\n";
  echo "Test percentage     : $test_percentage%
\n"; echo "

Decision tree performance

\n"; $nodes = explode("LEVEL",$trace); echo "
Nodes in tree (including leaves) : ".(sizeof($nodes)-1);
  // Report accuracy on the training set together with the number of training instances.
  // $training_accuracy is either the bcdiv() percentage (3 decimal places) computed above
  // or 0 when no training instances were selected.
  echo "\nAccuracy on training set         : $training_accuracy% ($training_size instances)\n";
  echo "Accuracy on test set             : $test_accuracy% ($test_size instances)
\n"; echo "

Decision tree trace

\n
$trace
\n"; } else { // parse this section if data has not yet been submitted or data is invalid if (!isset($training_percentage)) { $training_percentage = 50; $test_percentage = 50; } ?>
Data set URL:

OR

Data set name: (Enter data below)

Training percentage: Test percentage: