#! /usr/bin/env python

import random
from collections import defaultdict



# Simple program that selects a random subset of the training set for active learning, as using the entire set would be too slow. Prints out the statistics of the full set and of the subset, to check that the selection hasn't done anything too drastic.

# Parameters...
# Maximum number of lines kept in the subset: after shuffling, the script slices off the first `count` lines.
count = 10000



# Function that calculates and prints the statistics...
def print_stats(lines):
  """Print how many instances of each category occur in `lines`.

  The category of a line is its last whitespace-separated field, parsed as
  an integer. One row is printed per category, in ascending category order,
  giving the instance count and its percentage of all counted lines.

  Args:
    lines: Iterable of text lines, each ending in an integer category label.
      Lines that contain only whitespace are ignored.

  Raises:
    ValueError: If the last field of a non-blank line is not an integer.
  """
  stats = defaultdict(int)
  total = 0
  for line in lines:
    fields = line.split()
    if not fields:
      # Blank line: there is no label to count, so skip it instead of
      # failing on fields[-1].
      continue
    stats[int(fields[-1])] += 1
    total += 1

  # Sorted so the report order does not depend on dict iteration order.
  for cat in sorted(stats):
    num = stats[cat]
    # A parenthesised single-argument print behaves identically on
    # Python 2 and Python 3. total is non-zero whenever stats is non-empty.
    print('cat %i has %i instances (%.2f%%)' % (cat, num, 100.0 * num / total))



# Read in the data...
# The with-block closes the input file as soon as it has been read.
with open('shuttle.trn','r') as f:
  orig = f.readlines()

# Output its stats...
# Parenthesised single-argument prints behave the same on Python 2 and 3.
print('Original')
print_stats(orig)
print('')



# Shuffle and select the relevant set...
# The shuffle is in place, so orig itself is left in random order.
random.shuffle(orig)
subset = orig[:count]

# Stats, again...
print('Selected subset:')
print_stats(subset)



# Save it...
# Lines still carry their newline from readlines(), so they are written as-is;
# the with-block closes the file even if a write fails.
with open('shuttle_subset.trn','w') as f:
  f.writelines(subset)
