A simple function to extract the Nth column from your text makes this reasonably straight-forward. I’ve assumed when you say “Column 11” you mean, the 11 column counting from 1, not the index-11 column where the 1st item is index-0
Pseudo-Code:
Until there's no data left ~
Read line1 from file1
Read line2 from file2
Extract Col11 from line1 as a real number
Extract Col2 & Col3 from line2 as real numbers
IF Col11 is within Col2 & Col3
do something
Python Code:
import sys
# Given a space-separated row of data, return the Nth column as a real number
def getNthColumn(row, N):
# Single-space the row, removing tabs, double-spaces etc.
row = ' '.join(row.split())
fields = row.split(' ')
result = float(fields[N-1]) # fields are numbered 0->(N-1)
#print("Returning column %d from [%s] -> %f" % (N, row, result))
return result
if (len(sys.argv) == 3):
fin1 = open(sys.argv[1], "rt")
fin2 = open(sys.argv[2], "rt") #TODO - handle file-not-found errors, etc.
line1 = fin1.readline()
line2 = fin2.readline()
while (line1 != "" and line2 != ""):
# Get the columns from the two lines
f1_col11 = getNthColumn(line1, 11)
f2_col2 = getNthColumn(line2, 2)
f2_col3 = getNthColumn(line2, 3) ### TODO handle errors
# work out if it's a keeper
# print("Is %f >= %f and %f <= %f" % (f1_col11, f2_col2, f1_col11, f2_col3))
if (f1_col11 >= f2_col2 and f1_col11 <= f2_col3):
print("MATCH: "+line1)
else:
print("NO-MATCH: "+line1)
# Next rows
line1 = fin1.readline()
line2 = fin2.readline()
else:
print("Give 2 files as arguments")
To be honest, if speed really is critical for this, it would be better to write it in a compiled language, e.g.: C/C++/Pascal, etc. etc.
EDIT: tested and working, added some debug print()s
EDIT2: Search file1-row against all rows in file2
import sys
# Hold all the file2 Columns
file2_col23 = []
# Given a space-separated row of data, return the Nth column as a real number
def getNthColumn(row, N):
# Single-space the row, removing tabs, double-spaces etc.
row = ' '.join(row.split())
fields = row.split(' ')
try:
result = float(fields[N-1]) # fields are numbered 0->(N-1)
except:
sys.stderr.write("Failed to fetch number column %d from [%s]" % (N, row))
sys.exit(1)
#print("Returning column %d from [%s] -> %f" % (N, row, result))
return result
if (len(sys.argv) == 3):
fin1 = open(sys.argv[1], "rt")
fin2 = open(sys.argv[2], "rt") #TODO - handle file-not-found errors, etc.
# Load in the whole of file2, but just the column2 & column3
# note the minimum col2 and maximum c3
line2 = fin2.readline()
min_c2 = None
max_c3 = None
while (line2 != ""):
col2 = getNthColumn(line2, 2)
col3 = getNthColumn(line2, 3)
file2_col23.append( ( col2, col3 ) )
# Note the min c2 and max c3 so we can quickly know if a search can
# possible produce a result
if (min_c2 == None or col2 < min_c2):
min_c2 = col2
if (max_c3 == None or col3 > max_c3):
max_c3 = col3
# next line
line2 = fin2.readline().strip()
# sort the columns to allow us to short-cut searching
file2_col23.sort()
line1 = fin1.readline()
while (line1 != ""):
col11 = getNthColumn(line1, 11)
matched = False
# is col11 is within any file2 row col2 or col3
if (col11 >= min_c2 and col11 <= max_c3): # make sure the search is worthwhile
for col23 in file2_col23:
(col2, col3) = col23
if (col11 >= col2 and col11 <= col3):
matched = True
break
if (matched == True):
print("MATCH: "+str(line1))
else:
print("NO-MATCH: "+str(line1))
# Next row
line1 = fin1.readline()
else:
print("Give 2 files as arguments")
9
solved Extract rows having the 11th column values lies between 2nd and 3nd of a second file if 1st column matches