#!/usr/bin/python
"""
Second BioCreative challenge
Protein Protein Interaction, IAS subtask
submission format check script.

The script reads two tab-separated prediction files (entries predicted as
PPI relevant and entries predicted as not PPI relevant) plus a file listing
one PubMed ID per line, prints a message for every inconsistency it finds
and finishes with a short summary.

Each prediction line must hold exactly six tab-separated columns, read as:
team id, run id, task id, prediction type, rank, PubMed ID.

The code is written so that it runs unchanged on Python 2.6+ and Python 3.
"""

###########################
# IMPORT PYTHON LIBRARIES #
###########################

# NOTE: os, random, re and string are not used by the code below; they are
# kept because other tooling may rely on them being importable from here.
import os
import random
import re
import string
import sys
from optparse import OptionParser

# Number of tab-separated columns every prediction line must contain.
_EXPECTED_COLUMNS = 6


####################
# OUTPUT HELPERS   #
####################

def _say(*parts):
    """Write *parts* to stdout separated by single spaces, plus a newline.

    This mirrors what the Python 2 statement ``print a, b`` emits while
    being valid syntax on both Python 2 and Python 3.
    """
    sys.stdout.write(" ".join(str(part) for part in parts) + "\n")


def _read_lines(path):
    """Return all lines of the text file *path* (line endings included).

    *path* is converted to ``str`` and stripped of surrounding whitespace
    before opening. If the file cannot be opened an error message naming
    the file is printed and the script exits with status 2.
    """
    path = str(path).strip()
    try:
        # The context manager guarantees the handle is closed again.
        with open(path, 'r') as handle:
            return handle.readlines()
    except (IOError, OSError):
        _say("Error could not open file: ", path)
        sys.exit(2)


####################
####################

def get_nr_pmidlist(red_pmid_list):
    """Return the items of *red_pmid_list* without duplicates.

    The first occurrence of every item is kept and the original order is
    preserved. Items must be hashable (the script only passes strings).
    """
    seen = set()
    nr_list = []
    for pmid in red_pmid_list:
        if pmid not in seen:
            seen.add(pmid)
            nr_list.append(pmid)
    return nr_list


###########################
# INPUT ERROR CHECK
###########################

def capture_input_errors(pred_true, pred_false, pmid_test_file):
    """Check for missing input arguments.

    Prints a message and exits with status 2 for the first argument that
    is ``None``; returns ``None`` when all three are present.
    """
    required = (
        (pred_true, "Please specify the PPI relevant prediction file!"),
        (pred_false, "Please specify the PPI not relevant prediction file!"),
        (pmid_test_file, "Please specify the PMID check file!"),
    )
    for value, message in required:
        if value is None:
            _say(message)
            sys.exit(2)


########################
# GET WHOLE PMID LIST  #
########################

def get_pmid_list(pmid_test_file):
    """Get the list of PubMed IDs in the data collection.

    Reads *pmid_test_file* line by line and returns the stripped content
    of every non-blank line, in file order. Exits with status 2 (after
    printing an error) when the file cannot be opened.
    """
    pmid_list = []
    for raw_line in _read_lines(pmid_test_file):
        pmid = raw_line.strip()
        # Blank lines (e.g. a trailing empty line) are not PubMed IDs and
        # would otherwise distort the entry counts.
        if pmid:
            pmid_list.append(pmid)
    return pmid_list


###############################
# GET/CHECK PREDICTION FILES  #
###############################

def get_pred_list(pred_file):
    """Get the content of a prediction run file, column by column.

    Returns a 6-tuple of equally long lists: team ids, run ids, task ids,
    prediction types, ranks and PubMed IDs (all stripped strings).

    Format problems are reported on stdout with the 1-based line number:

    * a line that does not have exactly six tab-separated columns is
      reported and parsing stops at that line;
    * a line with an empty column is reported (first empty column only)
      but is still added to the returned lists.

    Exits with status 2 (after printing an error) when the file cannot be
    opened.
    """
    team_list = []
    run_id_list = []
    task_id_list = []
    pred_type_list = []
    pred_rank_list = []
    pred_pmid_list = []
    column_lists = (team_list, run_id_list, task_id_list,
                    pred_type_list, pred_rank_list, pred_pmid_list)

    for line_counter, lines in enumerate(_read_lines(pred_file), 1):
        line_split = lines.split('\t')
        if len(line_split) != _EXPECTED_COLUMNS:
            _say("CHECK SUBMISSION FORMAT OF LINE: ", line_counter)
            _say(lines)
            _say(line_split)
            break

        fields = [column.strip() for column in line_split]
        for col_cnt, column in enumerate(fields, 1):
            if not column:
                _say("MISSING VALUE FOR ", col_cnt)
                _say("LINE NUMBER: ", line_counter)
                _say(lines)
                _say(line_split)
                # Only the first empty column of a line is reported.
                break

        for target, value in zip(column_lists, fields):
            target.append(value)

    return (team_list, run_id_list, task_id_list,
            pred_type_list, pred_rank_list, pred_pmid_list)


###########################
# CHECK HELPERS           #
###########################

def _report_if_not_constant(values, message, file_name):
    """Report when *values* holds more than one distinct entry.

    Returns the de-duplicated values so callers can reuse them.
    """
    distinct = get_nr_pmidlist(values)
    if len(distinct) > 1:
        _say(message, file_name)
        _say(distinct)
    return distinct


def _report_if_repeated(values, message, file_name):
    """Report when *values* contains at least one repeated entry.

    Returns the de-duplicated values so callers can reuse them.
    """
    distinct = get_nr_pmidlist(values)
    if len(distinct) != len(values):
        _say(message, file_name)
        _say(distinct)
    return distinct


def _percentage(part, whole):
    """Return *part* as a percentage of *whole* (0.0 when *whole* is 0)."""
    if not whole:
        return 0.0
    return 100.0 * float(part) / float(whole)


def _print_usage():
    """Print the command line help shown when no option is given."""
    _say("Please specify input options!")
    _say("")
    _say("Example input for team 60, run 1:")
    _say("./formatcheck_bc2_ppi_ias_V01.py --t BC2_PPI_IAS_T60_BC2_PPI_1_T "
         "--f BC2_PPI_IAS_T60_BC2_PPI_1_F --i bc2_ppi_ias_pmid_test.txt")
    _say("")
    _say("Where:")
    _say("--t refers to the entries predicted as PPI relevant")
    _say("--f refers to the entries predicted as not PPI relevant")
    _say("--i refers to the test set PMID check file, "
         "i.e. bc2_ppi_ias_pmid_test.txt")
    _say("\n")


###########################
# MAIN FUNCTION OF SCRIPT #
###########################

def main(argv=None):
    """Run all submission format checks and print the prediction summary.

    *argv* is the list of command line arguments without the program
    name; it defaults to ``sys.argv[1:]``. With no arguments the usage
    text is printed and the script exits with status 0. A missing option
    or an unreadable file ends the script with status 2.
    """
    if argv is None:
        argv = sys.argv[1:]
    argv = list(argv)

    # Define script input options
    parser = OptionParser()
    parser.add_option("--t", default=None,
                      help="Predicted as PPI relevant", dest="pred_true")
    parser.add_option("--f", default=None,
                      help="Predicted as not PPI relevant", dest="pred_false")
    parser.add_option("--i", default=None,
                      help="PMID check file", dest="pmid_test_file")

    # Read the options provided by the user
    (options, _args) = parser.parse_args(argv)

    if not argv:
        _print_usage()
        sys.exit(0)

    pred_true = options.pred_true
    pred_false = options.pred_false
    pmid_test_file = options.pmid_test_file

    # check input argument
    capture_input_errors(pred_true, pred_false, pmid_test_file)

    # get pmid list
    pmid_list = get_pmid_list(pmid_test_file)

    # get PPI relevant predicted
    (team_t_list, run_id_t_list, task_id_t_list,
     _pred_type_t_list, pred_rank_t_list,
     pred_pmid_t_list) = get_pred_list(pred_true)

    # get PPI not relevant predicted
    (team_f_list, run_id_f_list, task_id_f_list,
     _pred_type_f_list, pred_rank_f_list,
     pred_pmid_f_list) = get_pred_list(pred_false)

    ###################
    # CHECK TEAM NAMES #
    ###################
    nr_t_names = _report_if_not_constant(
        team_t_list, "CHECK TEAM NAMES COLUMN IN ", pred_true)
    nr_f_names = _report_if_not_constant(
        team_f_list, "CHECK TEAM NAMES COLUMN IN ", pred_false)

    #######################
    # CHECK PREDICTION RUN #
    #######################
    _report_if_not_constant(
        run_id_t_list, "CHECK RUN ID COLUMN IN ", pred_true)
    _report_if_not_constant(
        run_id_f_list, "CHECK RUN ID COLUMN IN ", pred_false)

    #######################
    # CHECK TASK ID COLUMN #
    #######################
    _report_if_not_constant(
        task_id_t_list, "CHECK TASK ID COLUMN IN ", pred_true)
    _report_if_not_constant(
        task_id_f_list, "CHECK TASK ID COLUMN IN ", pred_false)

    #####################
    # CHECK RANK COLUMN  #
    #####################
    # Every rank may appear only once per file.
    _report_if_repeated(
        pred_rank_t_list, "CHECK RANK COLUMN IN ", pred_true)
    _report_if_repeated(
        pred_rank_f_list, "CHECK RANK COLUMN IN ", pred_false)

    #######################
    # CHECK CORRECT PMIDS  #
    #######################
    known_pmids = set(pmid_list)
    for t_pmid in pred_pmid_t_list:
        if t_pmid not in known_pmids:
            _say("RELEVANT PREDICTED PMID ERROR , NOT IN PMID CHECK FILE: ",
                 t_pmid)
    for f_pmid in pred_pmid_f_list:
        if f_pmid not in known_pmids:
            _say("NOT RELEVANT PREDICTED PMID ERROR , "
                 "NOT IN PMID CHECK FILE: ", f_pmid)

    #####################################
    # CHECK PMID PREDICTIONS REDUNDANCY #
    #####################################
    _report_if_repeated(
        pred_pmid_t_list, "CHECK PMID REDUNDANCY IN ", pred_true)
    _report_if_repeated(
        pred_pmid_f_list, "CHECK PMID REDUNDANCY IN ", pred_false)

    ###################################
    # CHECK MULTIPLE PMID PREDICTIONS #
    ###################################
    false_pmids = set(pred_pmid_f_list)
    for pmid_true in pred_pmid_t_list:
        if pmid_true in false_pmids:
            _say("PMID PREDICTION ERROR, A GIVEN PMID CAN NOT BE IN BOTH "
                 "TRUE AND FALSE SET: ", pmid_true)

    ##########################
    # CHECK PREDICTION START #
    ##########################
    # An empty prediction file has no first rank to inspect.
    if pred_rank_t_list and pred_rank_t_list[0] != '1':
        _say("RANK IN PPI RELEVANT PREDICTIONS STARTS WITH 1, NOT WITH: ",
             pred_rank_t_list[0])
    if pred_rank_f_list and pred_rank_f_list[0] != '1':
        _say("RANK IN PPI NOT RELEVANT PREDICTIONS STARTS WITH 1, "
             "NOT WITH: ", pred_rank_f_list[0])

    #######################
    # CHECK MISSING PMIDS #
    #######################
    total_t = len(pred_pmid_t_list)
    total_f = len(pred_pmid_f_list)
    total_entries = len(pmid_list)
    if total_t + total_f != total_entries:
        _say("CHECK FOR MISSING PMID PREDICTIONS, YOU HAVE TO SUBMIT "
             "PREDICTIONS FOR ALL ENTRIES!")

    ############################
    # PRINT PREDICTION SUMMARY #
    ############################
    # Fall back to the other file (or a placeholder) when the PPI relevant
    # file yielded no team name at all.
    team_name = (nr_t_names or nr_f_names or ['UNKNOWN'])[0]
    rule = '=' * 71

    _say('\n')
    _say(rule)
    _say(' PREDICTION SUMMARY OF ', team_name)
    _say(rule)
    _say("FILE CONTAINING THE PPI RELEVANT PREDICTIONS: ")
    _say(pred_true)
    _say("FILE CONTAINING THE PPI NOT RELEVANT PREDICTIONS: ")
    _say(pred_false)
    _say('-' * 71)
    _say("TOTAL NUMBER OF ENTRIES IN DATA SET: ", total_entries)
    _say("TOTAL NUMBER OF ENTRIES PREDICTED AS PPI RELEVANT: ", total_t)
    _say("TOTAL NUMBER OF ENTRIES PREDICTED AS NOT PPI RELEVANT: ", total_f)
    _say("PERCENTAGE OF TOTAL SET PREDICTED AS PPI RELEVANT ",
         _percentage(total_t, total_entries))
    _say("PERCENTAGE OF TOTAL SET PREDICTED AS NOT PPI RELEVANT ",
         _percentage(total_f, total_entries))
    _say(rule)


if __name__ == '__main__':
    main()