Since your files are sorted, you don’t have to hold them in memory or parse them beyond splitting them into lines. Just keep advancing through whichever sequence has the smaller current element:
def count_equal(a, b):
    """
    Return how many values two sorted iterables have in common.

    Each argument is turned into an iterator and the pair is handed to
    ``_count_equal``, which performs the actual comparison.

    >>> odds = [1, 3, 5, 7, 9, 11, 13, 15]
    >>> primes = [2, 3, 5, 7, 11, 13]
    >>> count_equal(odds, primes)
    5
    """
    left = iter(a)
    right = iter(b)
    return _count_equal(left, right)
def _count_equal(a, b):
    """
    Count matching values between two ascending iterators.

    ``a`` and ``b`` must be iterators (they are advanced with ``next``)
    whose items are sorted in ascending order and mutually comparable.
    Whichever side currently holds the smaller value is advanced until the
    two meet; on a match the counter is bumped and both sides move on, so a
    repeated value is counted as many times as it is paired up.

    Returns the number of matches found by the time either iterator is
    exhausted, which is 0 if either one is empty to begin with.  The other
    iterator may be left with unconsumed items.
    """
    c = 0
    try:
        # Priming reads live inside the ``try`` so that an empty iterator
        # yields a count of 0 instead of leaking StopIteration to the caller.
        x = next(a)
        y = next(b)
        while True:
            # Skip values that can no longer have a partner on the other side.
            while x < y:
                x = next(a)
            while y < x:
                y = next(b)
            # After the second loop ``x`` may again be the smaller one, so
            # equality has to be checked explicitly before counting.
            if x == y:
                c += 1
                x = next(a)
                y = next(b)
    except StopIteration:
        # One side ran out: nothing further can match.
        return c
You can also count the lines in each file during the same pass:
from __future__ import division
class CountingIterable:
    """
    Wrap an iterable and keep a tally of the items handed out.

    ``count`` is 0 on a fresh wrapper, is reset when a new pass starts
    producing items, and grows by one for every item yielded.  After a
    complete pass it equals the number of items in the wrapped iterable.

    >>> lines = CountingIterable(['a', 'b', 'c'])
    >>> list(lines)
    ['a', 'b', 'c']
    >>> lines.count
    3
    """

    def __init__(self, iterable):
        self.iterable = iterable
        # Defined up front so ``count`` can be read at any time instead of
        # raising AttributeError before a pass has finished.
        self.count = 0

    def __iter__(self):
        # Generator: each call to iter() starts an independent pass that
        # restarts the tally once it begins producing items.
        self.count = 0
        for seen, x in enumerate(self.iterable, 1):
            # Record the item before handing it out so ``count`` always
            # reflects what the consumer has received so far.
            self.count = seen
            yield x
# Compare two sorted text files line by line and compute the share of lines
# they have in common, relative to the longer file.
# NOTE(review): lines are compared exactly as read, including the trailing
# newline, so a final line without one will not match -- confirm the inputs.
with open('file1.txt', 'r') as a, open('file2.txt', 'r') as b:
    a_counter = CountingIterable(a)
    b_counter = CountingIterable(b)
    # Hold on to the iterators so the same pass can be finished below.
    a_iterator = iter(a_counter)
    b_iterator = iter(b_counter)
    n = count_equal(a_iterator, b_iterator)
    # consume any remaining elements to acquire count
    for _ in a_iterator:
        pass
    for _ in b_iterator:
        pass

# Guard the denominator: two files without any lines would otherwise raise
# ZeroDivisionError.  They share nothing, so the ratio is reported as 0.0.
longest = max(a_counter.count, b_counter.count)
result = n / longest if longest else 0.0
1
solved How to efficiently compare two maps?