#!/usr/bin/perl #MODIFIED to accomodate known data type for input column fields # #INPUT: a tab separated file #ARGUMENTS: -c columns names are the first line of the file # -t [table_name] name of table #if no arguments, then the default table name is the "temp_table" and column names are col1, col2, col3, ... # ARGUMENT HANDLER # $#ARGV is the index number for the last argument # array indexes start at 0 $num_args = $#ARGV + 1; #die if no arguments if ($num_args == 0) { print "\ndescription:\tCreates a MySQL script that will 1) create a table\n"; print " \t\t\t\t\t\t 2) load the data in [filename] into the table\n"; print " \t\t\t\t\t\t 3) update entries with NULL values to the MySQL NULL value\n\n"; print "usage:\n"; print "\t\t make_load_sql_script [filename]\t\t\t columns are labeled col1,col2,col3,etc\n"; print "\t\t\t\t\t\t\t\t\t table name is same as filename (without extenstion)\n"; print "\t\t\t\t\t\t\t\t\t NULL value is \"NULL\"\n"; print "\t\t make_load_sql_script -c [filename]\t\t\t columns labels are read from the first line of [filename]\n"; print "\t\t make_load_sql_script -cf [column_filename] [filename]\t columns labels are read in order from [column_filename]\n"; print "\t\t make_load_sql_script -t [tablename] [filename]\t\t table name [tablename]\n"; print "\t\t make_load_sql_script -n [nullvalue] [filename]\t\t NULL value is [nullvalue], should be enclosed in \" quotes, like \"\?\"\n\n"; die; } #die if too many arguments die print "Too many arguments scare and confuse me!\n" unless ($num_args < 9); $file_count = 0; #loop over each argument ARG: foreach $arg_count (0 .. $num_args-1) { #use -c option? if ($ARGV[$arg_count] eq "-c") { $col_in_file = "true" and next ARG } #use -t option? elsif ($ARGV[$arg_count] eq "-t") { #die if there is more than one -t option if ($table_given) { print "Multiple -t arguments, bad!\n" and die } else { #get table name if using -t option $table_given = "true"; $table_name = $ARGV[$arg_count+1] unless ($arg_count+1 > $#ARGV); next ARG } } elsif ($ARGV[$arg_count] eq "-cf") { #die if there is more than one -cf option if ($colnam_given) { print "Multiple -cf arguments, bad!\n" and die } else { #get column file name if using -cf option $colnam_given = "true"; $colnam_filename = $ARGV[$arg_count+1] unless ($arg_count+1 > $#ARGV); next ARG } } elsif ($ARGV[$arg_count] eq "-n") { #die if there is more than one -n option if ($null_given) { print "Multiple -n arguments, bad!\n" and die } else { #get NULL designation if using -n option $null_given = "true"; $null_value = $ARGV[$arg_count+1] unless ($arg_count+1 > $#ARGV); next ARG } } elsif ((($ARGV[$arg_count-1] ne "-t") && ($ARGV[$arg_count-1] ne "-n") && ($ARGV[$arg_count-1] ne "-cf")) || ($arg_count == 0)) { #get file name $file_count++; $file_name = $ARGV[$arg_count]; } } #die if there is no filename print "I need to know atleast what data file to analyze!\n" and die unless ($file_name); #die if there is no obvious filename print "I can not figure out which argument is the filename!\n" and die unless ($file_count == 1); #die if there is no tablename if -t option was specified print "I need to know the table name!\n" and die unless (($table_name) || !($table_given)); #die if there is no null name if -n option was specified print "I need to know the NULL designation\n" and die unless (($null_value) || !($null_given)); #die if there is no column file name if -cf option was specified print "I need to know the filename that contains the column names!\n" and die unless (($colnam_filename) || !($colnam_given)); #die if both column naming options are chosen print "Either -c or -cf can be specified, but *not* both!\n" and die if ($colnam_given && $col_in_file); # ARGUMENT HANDLER DIAGNOSTICS #print "number of arguments = \t$num_args\n"; #$count = 0; #foreach $argument (@ARGV) # { # print "$count\t$argument\n"; # $count++; # } #print "column headings in file? = \t $col_in_file\n" if ($col_in_file); #print "table name = \t $table_name\n" if ($table_name); #print "column file name = \t $colnam_filename\n" if ($colnam_filename); #print "file name = \t $file_name\n" if ($file_name); #die; #Determinig the maximum size of each column and reading column headings if specified open(DATA,"<$file_name") or print "$file_name: File not found!\n" and die; $line_count = 0; LINE: while ($line = ) { # remove, if present, the "new line" character at the end of the line chomp($line); $line_count++; @this_row = split /\t/, $line; # if this is the first line of the file: # 1) check to see if it should be used set the column labels # 2) intialize the maximum column size array # if this is not the first line of the file: # 1) update the maximum column size array # if the column names are from a separate file, then it will read the column names from that file # if there are too many names in the column file, then the excess will be skipped # if there are too few names in the column file, then the extra columns will be given generic names if ($line_count == 1) { $num_cols = $#this_row+1; if ($col_in_file) { @column_labels = split /\t/, $line; @max_col_size = (0) x $num_cols; } elsif ($colnam_given) { open(COLFILE,"<$colnam_filename") or print "$colnam_filename: File not found!\n" and die; $col_count = 0; while ($line2 = ) { chomp($line2); $column_labels[$col_count] = $line2; $max_col_size[$col_count] = length($this_row[$col_count]); $col_count++; } if ($col_count < $num_cols) { while($col_count < $num_cols) { $column_labels[$col_count] = "col$col_count"; $max_col_size[$col_count] = length($this_row[$col_count]); $col_count++; } } close(COLFILE); } else { foreach $col_count (0 .. ($num_cols-1)) { $column_labels[$col_count] = "col$col_count"; $max_col_size[$col_count] = length($this_row[$col_count]); } } } else { foreach $col_count (0 .. ($num_cols-1)) { if ($max_col_size[$col_count] < length($this_row[$col_count])) {$max_col_size[$col_count] = length($this_row[$col_count]) } } } } #create the script #sets the table name to the file name upto the ".", for example: file_name = "poop.crap" -> then table_name = "poop" @file_name_split = split /\./, $file_name; $table_name = $file_name_split[0] unless ($table_name); print "CREATE TABLE $table_name\n"; print " (\n"; foreach $col_count (0 .. ($num_cols-1)) { #add 1 to each maximum column size just to be safe #if length > 255 then make it a blob $size = $max_col_size[$col_count]+1; if ($size < 254) { print "\t\`$column_labels[$col_count]\`"; if ($column_labels[$col_count] =~ /_count/) { print " SMALLINT(4) UNSIGNED" } elsif ($column_labels[$col_count] =~ /run_number/) { print " SMALLINT" } elsif ($column_labels[$col_count] =~ /ps[12345678]/) { print " VARCHAR($size)" } else { print " FLOAT(16,9)" } } else { print "\t\`$column_labels[$col_count]\` BLOB" } print ",\n"; } print " INDEX (run_number)\n"; print " );\n"; print "LOAD DATA LOCAL INFILE \'$file_name\' INTO TABLE $table_name;\n"; #update the table to reflect NULL designation $null_value = "NULL" unless ($null_value); foreach $col_count (0 .. ($num_cols-1)) { print "UPDATE $table_name SET \`$column_labels[$col_count]\` = NULL WHERE \`$column_labels[$col_count]\` = \"$null_value\";\n"; }