## Replace gene_id column with gene_name column in the gtf file for RSEM.
## Also check if any transcript_id defined in multiple chromosomes.
##
## Usage: perl <this_script> <file.gtf>
##
## Reads the tab-separated file given as the first argument and writes two
## files next to the current working directory:
##   valid_<file>    - lines whose 9th column has been rewritten to contain
##                     only "gene_id <value> transcript_id <value>", where
##                     gene_id is taken from gene_name (falling back to
##                     gene_id) and transcript_id from transcript_id (falling
##                     back to transcript_name, then gene_id).
##   notvalid_<file> - lines for which no gene/transcript identifier could be
##                     found (copied verbatim), plus one diagnostic line for
##                     every record whose transcript_id was first seen on a
##                     different chromosome (column 1).
##
## NOTE: the output names are built by prefixing the argument as given, so
## the argument is expected to be a plain file name rather than a path.
use strict;
use warnings;

my ($fileName) = @ARGV;
if (not defined $fileName) {
    die "File name is required.";
}
else {
    print "valid_$fileName and notvalid_$fileName will be created";
}

# Open the input first so that a bad input path does not leave empty output
# files behind. Three-argument open keeps the file name from being
# interpreted as an open mode.
open(my $in, '<', $fileName)
    or die "Cannot open $fileName for reading: $!\n";
open(my $valid_out, '>', "valid_$fileName")
    or die "Cannot open valid_$fileName for writing: $!\n";
open(my $notvalid_out, '>', "notvalid_$fileName")
    or die "Cannot open notvalid_$fileName for writing: $!\n";

# Chromosome (column 1) on which each transcript_id was first seen.
my %chromosome_of;

LINE: while (my $line = <$in>) {
    chomp $line;
    my @fields = split /\t/, $line;

    # Lines with fewer than nine columns (e.g. header comments) carry no
    # attributes; they fall through to the "not valid" output below.
    my @attributes = defined $fields[8] ? split(/;/, $fields[8]) : ();

    # Attribute key => value with a trailing ";" re-appended, so the value
    # can be written back in GTF attribute syntax unchanged.
    my %value_of;
    for my $attribute (@attributes) {
        my ($key, $value) = split ' ', $attribute, 2;
        next unless defined $key;    # skip empty / whitespace-only fragments
        $value = '' unless defined $value;
        $value_of{$key} = $value . ';';
    }

    # Pick the first attribute present, in order of preference.
    my $gene_id = '';
    for my $key (qw(gene_name gene_id)) {
        if (exists $value_of{$key}) {
            $gene_id = $value_of{$key};
            last;
        }
    }

    my $transcript_id = '';
    for my $key (qw(transcript_id transcript_name gene_id)) {
        if (exists $value_of{$key}) {
            $transcript_id = $value_of{$key};
            last;
        }
    }

    if ($gene_id eq '' || $transcript_id eq '') {
        print {$notvalid_out} "$line\n";
        next LINE;
    }

    ## check if any transcript_id defined in multiple chromosomes.
    if (exists $chromosome_of{$transcript_id}) {
        if ($chromosome_of{$transcript_id} ne $fields[0]) {
            print {$notvalid_out}
                "$transcript_id: $chromosome_of{$transcript_id} vs $fields[0]\n";
            next LINE;
        }
    }
    else {
        $chromosome_of{$transcript_id} = $fields[0];
    }

    $fields[8] = join(' ', 'gene_id', $gene_id, 'transcript_id', $transcript_id);
    print {$valid_out} join("\t", @fields), "\n";
}

# Buffered write errors only surface at close, so check the output handles.
close($valid_out)    or die "Error closing valid_$fileName: $!\n";
close($notvalid_out) or die "Error closing notvalid_$fileName: $!\n";
close($in);