Pandas Combining Rows Based On Dates
Solution 1:
Fundamentally, I think this is a graph connectivity problem: a fast way of solving it will be some manner of graph connectivity algorithm. Pandas doesn't include such tools, but scipy does. You can use the compressed sparse graph (csgraph
) submodule in scipy to solve your problem like this:
from scipy.sparse.csgraph import connected_components
# convert to datetime, so min() and max() work
df.startDate = pd.to_datetime(df.startDate)
df.endDate = pd.to_datetime(df.endDate)
defreductionFunction(data):
# create a 2D graph of connectivity between date ranges
start = data.startDate.values
end = data.endDate.values
graph = (start <= end[:, None]) & (end >= start[:, None])
# find connected components in this graph
n_components, indices = connected_components(graph)
# group the results by these connected componentsreturn data.groupby(indices).aggregate({'startDate': 'min',
'endDate': 'max',
'shipNo': 'first'})
df.groupby(['Cust']).apply(reductionFunction).reset_index('Cust')
If you want to do something different with shipNo
from here, it should be pretty straightforward.
Note that the connected_components()
function above is not brute force, but uses a fast algorithm to find the connections.
Solution 2:
I used below a bit messy but does it Faster.
Thanks to Anurag Dabas for the help.
I merge df on its own and check the overlaps then remove extra rows, do this till there is no overlaps left.
Please note that I added all "shipNo".
import pandas as pd
import numpy as np
df = pd.DataFrame([['A','2011-02-07','2011-02-22',1],['A','2011-02-14','2011-03-10',2],['A','2011-03-07','2011-03-15',3],['A','2011-03-18','2011-03-25',4]], columns = ['Cust','startDate','endDate','shipNo'])
df['startDate'] = pd.to_datetime(df['startDate'])
df['endDate'] = pd.to_datetime(df['endDate'])
defoverlap_checker(data):
data['CuststartDateendDate']=data['Cust'].map(str)+data['startDate'].map(str)+data['endDate'].map(str)
df2=pd.merge(data,data,on='Cust')
df2['Overlap']=np.where((df2['startDate_x']<=df2['endDate_y'])&(df2['endDate_x']>=df2['startDate_y']) & (df2['CuststartDateendDate_x'] != df2['CuststartDateendDate_y']), 'Overlapped','not overlapped')
df2['startDate_x']=np.where(df2['Overlap'].eq('Overlapped'),df2[['startDate_x','startDate_y']].min(axis=1),df2['startDate_x'])
df2['endDate_x']=np.where(df2['Overlap'].eq('Overlapped'),df2[['endDate_x','endDate_y']].max(axis=1),df2['endDate_x'])
df2['shipNo']=df2['shipNo_x'].map(str)+df2['shipNo_y'].map(str)
df2['shipNo'] = df2['shipNo'].apply(lambda x: ' '.join(sorted(set(x))))
df2.rename(columns = {'startDate_x':'startDate','endDate_x':'endDate'}, inplace = True)
return df2, data
defoverlap_remover(df, data):
df2= df[(df['Overlap']=="Overlapped")]
data1=data[~data['CuststartDateendDate'].isin(df2['CuststartDateendDate_x'])]
df2 = df2.drop(columns=['startDate_y','endDate_y','Overlap','CuststartDateendDate_x','CuststartDateendDate_y','shipNo_x','shipNo_y'])
df2 = df2.drop_duplicates()
bigdata = data1.append(df2, ignore_index=True,sort=False)
return bigdata
dftmp, data = overlap_checker(df)
while dftmp['Overlap'].str.contains('Overlapped').any():
df = overlap_remover(dftmp,data)
dftmp, data = overlap_checker(df)
df = df.drop(columns=['CuststartDateendDate'])
df = df[['Cust','startDate','endDate','shipNo']]
print(df)
Solution 3:
If you are open to use an auxiliary data frame to hold the result, you can just loop through all the rows to be honest
from time import strptime
results = [df.iloc[0]]
for i, (_, current_row) inenumerate(df1.iterrows()):
try:
next_row = df.iloc[i+1]
if strptime(current_row['endDate'], '%Y-%M-%d') < strptime(next_row['startDate'], '%Y-%M-%d'):
results[-1]['endDate'] = current_row['endDate']
results.append(next_row)
except IndexError:
passprint pd.DataFrame(results).reset_index(drop=True)
Post a Comment for "Pandas Combining Rows Based On Dates"