Skipping Unknown Number Of Lines To Read The Header Python Pandas
I have some Excel data that I read in with Python pandas: import pandas as pd data = pd.read_csv('..../file.txt', sep='\t' ) The mock data looks like this: unwantedjunkline1 unwante
Solution 1:
If you know what the header starts with:
def skip_to(fle, line, **kwargs):
    """Read a delimited file into a DataFrame, starting at a known header line.

    Lines are consumed from the top of the file until one starts with
    ``line``; that line is treated as the header and everything from it
    onwards is handed to ``pd.read_csv``.

    Args:
        fle: Path of the file to read.
        line: Prefix that the header line starts with.
        **kwargs: Passed through unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        ValueError: If the file is empty, or if no line starts with ``line``.
    """
    if os.stat(fle).st_size == 0:
        raise ValueError("File is empty")
    with open(fle) as f:
        # `pos` always holds the offset at which `cur_line` begins.
        pos = 0
        cur_line = f.readline()
        while not cur_line.startswith(line):
            if not cur_line:
                # readline() returns "" only at end of file; without this
                # check a missing header would loop forever.
                raise ValueError(
                    "No line starting with {!r} found in {}".format(line, fle)
                )
            pos = f.tell()
            cur_line = f.readline()
        # Rewind to the start of the header so pandas parses it as row 0.
        f.seek(pos)
        return pd.read_csv(f, **kwargs)
Demo:
In [18]: cat test.txt
1,2
3,4
The,header
foo,bar
foobar,foo
In [19]: df = skip_to("test.txt","The,header", sep=",")
In [20]: df
Out[20]:
The header
0 foo bar
1 foobar foo
By calling .tell() before each read, we keep track of where the file pointer was at the start of the current line, so when we hit the header we can seek back to the start of that line and just pass the file object to pandas.
Or, if the junk lines all start with something in common, skip them by that prefix instead:
def skip_to(fle, junk, **kwargs):
    """Read a delimited file into a DataFrame, skipping leading junk lines.

    Lines are consumed from the top of the file for as long as they start
    with ``junk``; the first line that does not is treated as the header and
    everything from it onwards is handed to ``pd.read_csv``.

    Args:
        fle: Path of the file to read.
        junk: Prefix shared by every unwanted leading line.
        **kwargs: Passed through unchanged to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        ValueError: If the file is empty. If every line is junk, the error
            comes from ``pd.read_csv``, which is given nothing to parse.
    """
    if os.stat(fle).st_size == 0:
        raise ValueError("File is empty")
    with open(fle) as f:
        # `pos` always holds the offset at which `cur_line` begins.
        pos = 0
        cur_line = f.readline()
        # readline() returns "" only at end of file. Stopping there keeps an
        # empty `junk` prefix (which every string starts with) from looping
        # forever.
        while cur_line and cur_line.startswith(junk):
            pos = f.tell()
            cur_line = f.readline()
        # Rewind to the start of the first non-junk line (the header).
        f.seek(pos)
        return pd.read_csv(f, **kwargs)
df = skip_to("test.txt", "junk",sep="\t")
Solution 2:
Another simple way to achieve a dynamic skiprows would be something like this, which worked for me:
# Path of the tab-separated file. The same file is scanned for the header
# and then parsed, so the computed offset always matches the parsed file.
path = r'test.txt'

# Open the file and find the skiprows number: the index of the first line
# that starts with 'ID'. Iterating the handle avoids loading the whole file.
with open(path, encoding='utf-8') as readfile:
    skip = next(
        (idx for idx, text in enumerate(readfile) if text.startswith('ID')),
        None,
    )
if skip is None:
    # Fail with a clear message instead of a bare StopIteration.
    raise ValueError("No line starting with 'ID' found in " + path)
print(skip)

# Import the file with the separator \t, skipping everything above the header.
df = pd.read_csv(path, skiprows=skip, sep='\t')
Post a Comment for "Skipping Unknown Number Of Lines To Read The Header Python Pandas"