r'''data1.py
Jed Yang, 2016-11-12

Parse the downloaded gene-data.txt file in a specified way.

File format:
    One gene entry per line.  Each entry has several values, separated by the
    tab character, represented as '\t' in Python.  The first line, started
    with a '#', is a header line that contains the names of the fields.  A
    missing field is indicated by a '-'.

Task request:
    My biologist friend asks me for a list of gene SYMBOL (6th field), one per
    line, with duplicates removed.

Let us use getUniqueElements() to remove duplicates.  Of course, that means we
need to sort the results first.  Here we go!
'''


def main():
    '''Read gene-data.txt, sort and de-duplicate by SYMBOL, write out.txt.'''
    ###########################################################################
    # Step 1.  Read and process the file.  The processInput() function (below)
    # is similar to the construct in our weatherstats.py homework.  Instead of
    # numbers, our gene file has complicated data.  So to abstract this away,
    # I decided to make a class called Entry to keep track of each entry in my
    # file.  See below.
    filename = 'gene-data.txt'
    entries = processInput(filename)
    print('Processed', len(entries), 'entries.')

    ###########################################################################
    # Step 2.  Sort.  In the next step, we are going to use
    # getUniqueElements().  Recall that it requires that the list is sorted.
    # How do we do that?  Since entries is a list, it has a .sort() method.
    #
    # If Entry were a plain class, calling .sort() would give this error:
    #   TypeError: unorderable types: Entry() < Entry()
    # What this means is that the list contains our Entry objects, which
    # Python doesn't know how to sort.  We can certainly write our own sort.
    # But is there a way to teach Python how to sort our Entry objects?  When
    # we learned sorting algorithms, we seem to just need to *COMPARE* pairs
    # of things to know which one is *LESS THAN* the other.  So for Python to
    # sort things for us, it just needs a way to know if
    #   Entry object 1 < Entry object 2
    # Fortunately, we can write a __lt__ (``lt'' stands for ``less than'')
    # method in the Entry class definition; see Entry.__lt__ below.  It
    # returns the comparison of the relevant data, which is this object's
    # geneSymbol and the other object's geneSymbol.
    #
    # Now that Python knows how to *compare* pairs of Entry objects, it can
    # use its sorting algorithm on our Entry objects!
    entries.sort()

    ###########################################################################
    # Step 3.  Remove duplicates.
    #
    # With only __lt__ defined, all 5946 processed elements would be written
    # to the output file: getUniqueElements() would fail to remove duplicates.
    # The problem is that when it compares two Entry objects with the same
    # geneSymbol, it sees them as two separate Entry objects.  (This is
    # correct behaviour, since they *ARE* different objects.)  But we actually
    # want to consider them the same.  The problem is similarly solved if we
    # teach Python how to test for *EQUALITY* of two Entry objects, using the
    # __eq__ method (``eq'' stands for ``equals''); see Entry.__eq__ below.
    #
    # Now Python can test equality properly.  Everything should work: there
    # should be 661 lines written to file instead of all 5946 entries.
    unique = getUniqueElements(entries)

    ###########################################################################
    # Step 4.  Write the output to file.  If you need to save data to a file
    # in your final project, or in the future, you can look at this example.
    writeOutput('out.txt', unique)


###############################################################################
# Entry object ################################################################
###############################################################################
# Each Entry object is initialized with a line of the gene data file.  My
# friend cares about the 'SYMBOL' field in the tab-delineated file.  So let us
# track that.
class Entry:
    '''One gene entry; ordered, compared and hashed by its gene symbol.'''

    def __init__(self, line):
        '''Takes a single line from a file, parses, and stores relevant info.

        Raises IndexError if the line has fewer than six tab-separated fields.
        '''
        # Drop the line terminator first so that it cannot end up inside the
        # symbol when SYMBOL happens to be the last field on the line.
        array = line.rstrip('\r\n').split('\t')  # split line by tab ('\t')
        self.geneSymbol = array[5]  # save the 6th field, which is 'SYMBOL'

    # You have seen __str__ before.  If I print an Entry corresponding to a
    # line in my gene data, I just want the gene symbol, since that's what my
    # friend cares about.
    def __str__(self):
        '''Return the gene symbol.'''
        return self.geneSymbol

    # Teaches Python how to order Entry objects, which is what list.sort()
    # needs.
    def __lt__(self, other):
        '''Return True if this entry's gene symbol sorts before other's.'''
        if not isinstance(other, Entry):
            # Let Python try the reflected operation or raise TypeError.
            return NotImplemented
        return self.geneSymbol < other.geneSymbol

    # Teaches Python when two Entry objects count as "the same", which is
    # what getUniqueElements() relies on.
    def __eq__(self, other):
        '''Return True if other is an Entry with the same gene symbol.'''
        if not isinstance(other, Entry):
            # Comparing against None (or anything else) ends up False.
            return NotImplemented
        return self.geneSymbol == other.geneSymbol

    # Defining __eq__ disables the default __hash__; restore one that agrees
    # with __eq__ so entries can still be put in sets and used as dict keys.
    def __hash__(self):
        '''Return a hash consistent with __eq__.'''
        return hash(self.geneSymbol)


# Much like what we had in weatherstats.py homework.
def processInput(filename):
    '''Return a list of Entry objects processed from the file specified.

    Header lines (starting with '#') and empty lines are skipped.
    '''
    entries = []
    # The with-statement closes the file even if parsing a line fails.
    with open(filename, 'r') as infile:  # 'r' stands for reading
        for line in infile:
            if line.startswith('#'):  # header line: field names, not data
                continue
            if not line.rstrip('\r\n'):  # empty line: nothing to parse
                continue
            # We store each line as an Entry object.  See above.
            entries.append(Entry(line))
    return entries


# You already saw this earlier today.
def getUniqueElements(array):
    '''Returns a new list of only the unique elements of a list.

    Assumes that the list is sorted, so that equal elements are adjacent.
    '''
    unique = []
    for item in array:
        # Compare against the last element kept rather than a None sentinel,
        # so that a list that legitimately contains None is handled correctly.
        if unique and item == unique[-1]:
            continue
        unique.append(item)
    return unique


# A simple demo of writing to a file.
def writeOutput(filename, array):
    '''Writes each element of a list to the specified file.

    Assumes elements in the list can be turned into strings.  One element is
    written per line; a summary is printed when done.
    '''
    # 'w' stands for writing, cannot be omitted.  The with-statement makes
    # sure the file is flushed and closed when we are done.
    with open(filename, 'w') as outfile:
        for item in array:
            outfile.write(str(item) + '\n')
    print(len(array), 'lines written to file', filename)


if __name__ == '__main__':
    main()