# mean 1.24 by Boris New (http://www.borisnew.org)

# Vérifier le nb de "o"
# Vérifier ecartypep
# Vérifier 1211

# perl mean.pl data.txt >output.txt

# The number in the first column should be different for each line!!

# It is designed to be used with the output of collapse
# It computes the mean and the standard deviation of each line, prints the conditions followed by the mean, and replaces the outliers by "o" (in nameofthefile_outliers.txt)
# Then you can use these means in SPSS or Statistica, for instance, to compute your ANOVAs
# An analysis could be:
# perl azk2txt.pl LexicalDecision.azk | cut -f1,4,5,6  | collapse | perl mean.pl > Outputmean.txt
# Usage text, printed when the script is started without any argument.
$help = <<DOC;

mean 1.24 (2000) by Boris New (http://www.borisnew.org)

Use: perl mean.pl [-ls] [-li] [-b] inputfile.txt >outputfile.txt
Example:perl mean.pl -ls 1500 -b inputfile.txt >outputfile.txt

This program computes mean without outliers

-ls LIMIT SUP: indicates the maximum absolute limit
-li LIMIT INF: indicates the minimum absolute limit
-b :indicates if you want limits and standard deviation criterion
If you don't define these options, it considers that you want to exclude items more than two standard deviations

DOC

# No argument at all: show the usage text and stop.
if ($ARGV[0] eq "") {print $help;exit;}
use Getopt::Long;

# -ls and -li take an integer value, -b is a plain flag.  GetOptions removes
# the options it recognises from @ARGV, so what remains is the input file.
GetOptions(	"ls=i"=>\$ls,
	"li=i"=>\$li,
	"b"=>\$both);

# Build the name of the outlier report from the input file name.
# The extension is removed from the file name only, never from the directory
# part: stripping from the first dot of the whole path would turn
# "./data.txt" into "" and "/home/u.x/data.txt" into "/home/u".
$temp=$ARGV[0];
my ($dir,$base) = $temp =~ m{^(.*[/\\])?(.*)$}s;
$base =~ s/\..+//;
$temp=$dir.$base;
$file=$temp."_outliers.txt";
$file2=$temp."_outliers_abstracts.txt";
# OUTPUT stays a bareword handle because the rest of the script prints to it.
open(OUTPUT,'>',$file) or die "Cannot open $file for writing: $!\n";
#open(ABSTRACT,">$file2");
# Main loop: one input line = one item.
# For each line it
#   1. counts the data points and the errors ("e"),
#   2. replaces by "o" the values outside the absolute limits (-li / -ls),
#   3. computes a first mean and standard deviation,
#   4. replaces by "o" the values more than two standard deviations away
#      from that mean (when no absolute limit was given, or with -b),
#   5. writes the marked line to OUTPUT,
#   6. computes a second mean and standard deviation without the outliers
#      and prints "<factors> TAB <second mean>" on standard output.
# The per-item results are kept in @Items, @TousItems, @mean, @et, @mean2,
# @et2, @nb2, @nbdonnees, @nberreurs and @nboutliers; $nboutliers and $nbdata
# are running totals over the whole input.
while(<>){
	# Only lines starting with a letter or a digit are processed.
	if ($_ !~ /^[A-Za-z0-9éèêâ]/){next;}
	chomp;
	# $F[0] holds the factors, $F[1] to $F[n] the values.
	@F=split(/ /);
	# Store only the item number (first tab-separated field), not the factors.
	push @Items, (split(/\t/))[0];
	# Running line counter, used as index into every per-item array.
	$NomItem++;
	push @TousItems, $NomItem;

	# Count the data points and errors and apply the absolute limits.
	for ($i=1;$i<@F;$i++){
		$nbdonnees[$NomItem]++;
		if ($F[$i] eq "e"){$nberreurs[$NomItem]++;}
		if ($F[$i] !~ /^[0-9]/){next;}
		# A limit is applied only when its option was given.  Comparing against
		# an undefined -ls (as happened with -b alone or -b -li) would treat the
		# limit as 0 and flag every positive value as an outlier.
		if ((($li ne "") && ($F[$i] < $li)) || (($ls ne "") && ($F[$i] > $ls))){
			$F[$i] = "o";$nboutliers++;$nboutliers[$NomItem]++;
			next;
		}
		# NOTE(review): values rejected by an absolute limit are not counted
		# here, so the final outlier percentage uses this reduced total - confirm
		# that this is intended.
		$nbdata++;
	}

	# First mean and standard deviation (absolute outliers already removed).
	my ($nb,$moyenne,$ecart) = _mean_sd(@F[1..$#F]);
	$mean[$NomItem]=$moyenne;
	$et[$NomItem]=$ecart;

	# Standard deviation criterion: mean +/- 2 SD.
	if ((($ls eq "") && ($li eq "")) || ($both == 1)) {
		my $lim_sup=$moyenne+(2*$ecart);
		my $lim_inf=$moyenne-(2*$ecart);
		for ($i=1;$i<@F;$i++){
			if ($F[$i] !~ /^[0-9]/){next;}
			if (($F[$i]>$lim_sup)||($F[$i]<$lim_inf)){
				$F[$i]="o";$nboutliers++;$nboutliers[$NomItem]++;
			}
		}
	}

	# The line with its outliers marked goes to the outlier file.
	print OUTPUT "@F\n";

	# Second mean and standard deviation (without any outlier).
	($nb2[$NomItem],$mean2[$NomItem],$et2[$NomItem]) = _mean_sd(@F[1..$#F]);

	# You can change here the output format
	if ($nb2[$NomItem] > 0){
		printf "%s\t%.0f\n",$F[0],$mean2[$NomItem];
	}
	else {
		# No usable value on this line: report it and leave the mean empty
		# instead of dying with a division by zero.
		warn "mean.pl: no usable value on input line $., mean left empty\n";
		print "$F[0]\t\n";
	}
}

# Returns (count, mean, standard deviation) of the entries of the list that
# start with a digit; "e", "o" and other markers are ignored.
# The standard deviation divides by the count (population formula).
# Returns (0, 0, 0) when the list holds no such entry.
sub _mean_sd {
	my @valeurs = grep { /^[0-9]/ } @_;
	my $n = @valeurs;
	if ($n == 0){return (0,0,0);}
	my $somme = 0;
	foreach my $v (@valeurs){$somme += $v;}
	my $moy = $somme/$n;
	my $carres = 0;
	foreach my $v (@valeurs){$carres += ($v-$moy)**2;}
	return ($n,$moy,sqrt($carres/$n));
}
# Summary appended to the outlier file: overall totals first, then one line
# of statistics per item.

# Overall totals: number of outliers, number of data points, percentage.
# Both values may be undefined (no outlier found, or no data line read), so
# default them to 0 and avoid dividing by zero.
$nboutliers ||= 0;
$nbdata ||= 0;
$pourcent = $nbdata ? $nboutliers/$nbdata * 100 : 0;
# "%%" prints a literal percent sign.
printf OUTPUT "%s %s %.2f%%\n",$nboutliers,$nbdata,$pourcent;

# @F still holds the last processed line, so this is the number of
# space-separated fields of that line, leading factors field included.
my $nbchamps=@F;
print OUTPUT "$nbchamps données par lignes\n";

print OUTPUT "ItemNumber\tFirstMean\tFirstSD\tSecondMean\tSecondSD\tNbErr\tNbOut\tNbData\t%Err\n";
#for ($i=0;$i<@Items;$i++){
for my $i (0 .. $#TousItems){
	my $n=$TousItems[$i];
	# Items without error or outlier have no counter yet: show 0.
	$nberreurs[$n] ||= 0;
	$nboutliers[$n] ||= 0;
	# An item without any data field has an error rate of 0, not a division by zero.
	my $pcterr = $nbdonnees[$n] ? ($nberreurs[$n]/$nbdonnees[$n])*100 : 0;
	printf OUTPUT "%s\t%.2f\t%.2f\t%.2f\t%.2f\t%s\t%.0f\t%.0f\t%.0f\n",$Items[$i], $mean[$n], $et[$n], $mean2[$n], $et2[$n], $nberreurs[$n], $nboutliers[$n], $nbdonnees[$n], $pcterr;
}
# Buffered write errors only show up when the handle is closed.
close(OUTPUT) or warn "mean.pl: problem while closing the outlier file: $!\n";
__END__

1.02 Comments
1.03 Compute the outliers
1.031 Add uppercases to line 12
1.04 Writes who are the outliers in nameofthefile_outliers.txt
1.05 Translated in english
1.06 Absolute limits
1.1 Counter + absolute limits AND standard deviation
1.11 Correction d'un bug qd -ls et -b
1.22 Ajout d'un résumé à la fin de nomfichier_outliers.txt
1.23 Ajout de premier (avec outliers) et deuxième (sans outliers) mean et sd dans fichier résumé
1.24 Aide plus détaillée avec exemples
1.25 Sortie plus lisible pour le résumé


Help
perl mean.pl sjtemp.txt >sjtemp2.txt
mean takes an input file (here "sjtemp.txt") organised like this
00111	1	1 505.5 506.56 800 508.68 490 750 502 480 500
00112	1	2 375.88 400.57 412.54 410.42 402.6 403.9 50
where the first columns (separated by tabs) are what you want (for example your conditions)
and the columns separated by spaces are the reaction times 

and outputs two files: a first one (here "sjtemp2.txt") like this
00111	1	1	530
00112	1	2	401

and another file ("sjtemp_outliers.txt"), in which "o" is an outlier,
like this
00111	1	1 505.5 506.56 o 508.68 490 750 502 480 500
00112	1	2 375.88 400.57 412.54 410.42 402.6 403.9 o
2 16 12.50%
8 données par lignes
ItemNumber	FirstMean	FirstSD	SecondMean	SecondSD	NbErr	NbOut	NbData	%Err
00111	560.30	115.67	530.3425	83.5069514696232	0	1	9	0
00112	350.84	123.32	400.985	12.0005274189651	0	1	7	0