Subset pandas DataFrame based on two columns ignoring what order the match happens in

up vote
2
down vote

favorite

I have two Pandas DataFrames and I want to subset df_all based on the values within to_keep. Unfortunately this isn't straight forward pd.merge() or df.join() because I have multiple columns that I want to match on, and I don't care what order the match happens.

I don't care if df_all['source'] matches in either to_keep['from'] OR 'to_keep['to']

And then df_all['target'] matches in either to_keep['from'] OR to_keep['to'].

What I have below currently works, but it seems like a lot of work and hopefully this operation could be optimized.

import pandas as pd

import numpy as np



# create sample dataframe

df_all = pd.DataFrame({'from': ['a', 'a', 'b', 'a', 'b', 'c', 'd', 'd', 'd'], 

                       'to': ['b', 'b', 'a', 'c', 'c', 'd', 'c', 'f', 'e'], 

                       'time': np.random.randint(50, size=9),

                       'category': np.random.randn(9)

                       })



# create a key based on from & to

df_all['key'] = df_all['from'] + '-' + df_all['to']



df_all



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

5   -0.223074   c   33  d   c-d

6   -0.778988   d   47  c   d-c

7   -1.392306   d   0   f   d-f

8   0.910363    d   34  e   d-e



# create another sample datframe

to_keep = pd.DataFrame({'source': ['a', 'a', 'b'], 

                        'target': ['b', 'c', 'c'] 

                       })



to_keep



    source  target

0   a   b

1   a   c

2   b   c



# create a copy of to_keep

to_keep_flipped = to_keep.copy()



# flip source and target column names

to_keep_flipped.rename(columns={'source': 'target', 'target': 'source'}, inplace=True)



# extend to_keep with flipped version

to_keep_all = pd.concat([to_keep, to_keep_flipped], ignore_index=True)



to_keep_all



    source  target

0   a   b

1   a   c

2   b   c

3   b   a

4   c   a

5   c   b



# create a key based on source & target

keys = to_keep_all['source'] + '-' + to_keep_all['target']



keys



0    a-b

1    a-c

2    b-c

3    b-a

4    c-a

5    c-b

dtype: object



df_all[df_all['key'].isin(keys)]



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

edited Jun 16 at 2:42

asked Jun 15 at 18:57

CurtLH

3051313

add a comment |

up vote
2
down vote

favorite

I don't care if df_all['source'] matches in either to_keep['from'] OR 'to_keep['to']

And then df_all['target'] matches in either to_keep['from'] OR to_keep['to'].

What I have below currently works, but it seems like a lot of work and hopefully this operation could be optimized.

import pandas as pd

import numpy as np



# create sample dataframe

df_all = pd.DataFrame({'from': ['a', 'a', 'b', 'a', 'b', 'c', 'd', 'd', 'd'], 

                       'to': ['b', 'b', 'a', 'c', 'c', 'd', 'c', 'f', 'e'], 

                       'time': np.random.randint(50, size=9),

                       'category': np.random.randn(9)

                       })



# create a key based on from & to

df_all['key'] = df_all['from'] + '-' + df_all['to']



df_all



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

5   -0.223074   c   33  d   c-d

6   -0.778988   d   47  c   d-c

7   -1.392306   d   0   f   d-f

8   0.910363    d   34  e   d-e



# create another sample datframe

to_keep = pd.DataFrame({'source': ['a', 'a', 'b'], 

                        'target': ['b', 'c', 'c'] 

                       })



to_keep



    source  target

0   a   b

1   a   c

2   b   c



# create a copy of to_keep

to_keep_flipped = to_keep.copy()



# flip source and target column names

to_keep_flipped.rename(columns={'source': 'target', 'target': 'source'}, inplace=True)



# extend to_keep with flipped version

to_keep_all = pd.concat([to_keep, to_keep_flipped], ignore_index=True)



to_keep_all



    source  target

0   a   b

1   a   c

2   b   c

3   b   a

4   c   a

5   c   b



# create a key based on source & target

keys = to_keep_all['source'] + '-' + to_keep_all['target']



keys



0    a-b

1    a-c

2    b-c

3    b-a

4    c-a

5    c-b

dtype: object



df_all[df_all['key'].isin(keys)]



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

edited Jun 16 at 2:42

asked Jun 15 at 18:57

CurtLH

3051313

add a comment |

up vote
2
down vote

favorite

I don't care if df_all['source'] matches in either to_keep['from'] OR 'to_keep['to']

And then df_all['target'] matches in either to_keep['from'] OR to_keep['to'].

What I have below currently works, but it seems like a lot of work and hopefully this operation could be optimized.

import pandas as pd

import numpy as np



# create sample dataframe

df_all = pd.DataFrame({'from': ['a', 'a', 'b', 'a', 'b', 'c', 'd', 'd', 'd'], 

                       'to': ['b', 'b', 'a', 'c', 'c', 'd', 'c', 'f', 'e'], 

                       'time': np.random.randint(50, size=9),

                       'category': np.random.randn(9)

                       })



# create a key based on from & to

df_all['key'] = df_all['from'] + '-' + df_all['to']



df_all



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

5   -0.223074   c   33  d   c-d

6   -0.778988   d   47  c   d-c

7   -1.392306   d   0   f   d-f

8   0.910363    d   34  e   d-e



# create another sample datframe

to_keep = pd.DataFrame({'source': ['a', 'a', 'b'], 

                        'target': ['b', 'c', 'c'] 

                       })



to_keep



    source  target

0   a   b

1   a   c

2   b   c



# create a copy of to_keep

to_keep_flipped = to_keep.copy()



# flip source and target column names

to_keep_flipped.rename(columns={'source': 'target', 'target': 'source'}, inplace=True)



# extend to_keep with flipped version

to_keep_all = pd.concat([to_keep, to_keep_flipped], ignore_index=True)



to_keep_all



    source  target

0   a   b

1   a   c

2   b   c

3   b   a

4   c   a

5   c   b



# create a key based on source & target

keys = to_keep_all['source'] + '-' + to_keep_all['target']



keys



0    a-b

1    a-c

2    b-c

3    b-a

4    c-a

5    c-b

dtype: object



df_all[df_all['key'].isin(keys)]



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

edited Jun 16 at 2:42

asked Jun 15 at 18:57

CurtLH

3051313

I don't care if df_all['source'] matches in either to_keep['from'] OR 'to_keep['to']

And then df_all['target'] matches in either to_keep['from'] OR to_keep['to'].

What I have below currently works, but it seems like a lot of work and hopefully this operation could be optimized.

import pandas as pd

import numpy as np



# create sample dataframe

df_all = pd.DataFrame({'from': ['a', 'a', 'b', 'a', 'b', 'c', 'd', 'd', 'd'], 

                       'to': ['b', 'b', 'a', 'c', 'c', 'd', 'c', 'f', 'e'], 

                       'time': np.random.randint(50, size=9),

                       'category': np.random.randn(9)

                       })



# create a key based on from & to

df_all['key'] = df_all['from'] + '-' + df_all['to']



df_all



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

5   -0.223074   c   33  d   c-d

6   -0.778988   d   47  c   d-c

7   -1.392306   d   0   f   d-f

8   0.910363    d   34  e   d-e



# create another sample datframe

to_keep = pd.DataFrame({'source': ['a', 'a', 'b'], 

                        'target': ['b', 'c', 'c'] 

                       })



to_keep



    source  target

0   a   b

1   a   c

2   b   c



# create a copy of to_keep

to_keep_flipped = to_keep.copy()



# flip source and target column names

to_keep_flipped.rename(columns={'source': 'target', 'target': 'source'}, inplace=True)



# extend to_keep with flipped version

to_keep_all = pd.concat([to_keep, to_keep_flipped], ignore_index=True)



to_keep_all



    source  target

0   a   b

1   a   c

2   b   c

3   b   a

4   c   a

5   c   b



# create a key based on source & target

keys = to_keep_all['source'] + '-' + to_keep_all['target']



keys



0    a-b

1    a-c

2    b-c

3    b-a

4    c-a

5    c-b

dtype: object



df_all[df_all['key'].isin(keys)]



    category    from    time    to  key

0   0.374312    a   38  b   a-b

1   -0.425700   a   0   b   a-b

2   0.928008    b   34  a   b-a

3   -0.160849   a   44  c   a-c

4   0.462712    b   4   c   b-c

python pandas

edited Jun 16 at 2:42

asked Jun 15 at 18:57

CurtLH

3051313

edited Jun 16 at 2:42

asked Jun 15 at 18:57

CurtLH

3051313

edited Jun 16 at 2:42

asked Jun 15 at 18:57

CurtLH

3051313

asked Jun 15 at 18:57

CurtLH

3051313

asked Jun 15 at 18:57

CurtLH

3051313

add a comment |

1 Answer
1

active

oldest

votes

up vote
0
down vote

First, some comments on your question-asking: in your initial statement, you talk about matching 'col1' etc., but in your actual code, you have 'from, 'to', 'source', and 'target'. You talk about subsetting, but then you talk about pd.merge(), and those are completely different things. You formatting is poor, resulting your column names not lining up with the actual columns when you show the output in your code. When you generate random sample data, you should set a seed so that people will get the same data, and can check whether their code is doing the same thing as yours. Your test data is poorly chosen. For instance, does row for which 'from' matches one of the to_keep columns have to match the row in which 'to' matches, or can they match different rows? Can they both match the same column (e.g. 'from' and 'to' both match 'target'), or do they have to match different columns. Neither your test cases nor your problem description are clear on those points; one has to go through your code to figure out what you mean.

Assuming that they have match on the same row, and in different columns, this code should work:

def check(row):

    forward = (to_keep['source'] == row['from']) & (to_keep['target'] == row['to'])

    reverse = (to_keep['source'] == row['to']) & (to_keep['target'] == row['from'])

    return any(forward) | any(reverse)





kept_df = df_all.loc[[check(row) for row in df_all.iterrows()]]

edited Jun 15 at 22:31

answered Jun 15 at 21:59

Acccumulation

1,03915

add a comment |

Your Answer

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f196591%2fsubset-pandas-dataframe-based-on-two-columns-ignoring-what-order-the-match-happe%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

up vote
0
down vote

Assuming that they have match on the same row, and in different columns, this code should work:

def check(row):

    forward = (to_keep['source'] == row['from']) & (to_keep['target'] == row['to'])

    reverse = (to_keep['source'] == row['to']) & (to_keep['target'] == row['from'])

    return any(forward) | any(reverse)





kept_df = df_all.loc[[check(row) for row in df_all.iterrows()]]

edited Jun 15 at 22:31

answered Jun 15 at 21:59

Acccumulation

1,03915

add a comment |

up vote
0
down vote

Assuming that they have match on the same row, and in different columns, this code should work:

def check(row):

    forward = (to_keep['source'] == row['from']) & (to_keep['target'] == row['to'])

    reverse = (to_keep['source'] == row['to']) & (to_keep['target'] == row['from'])

    return any(forward) | any(reverse)





kept_df = df_all.loc[[check(row) for row in df_all.iterrows()]]

edited Jun 15 at 22:31

answered Jun 15 at 21:59

Acccumulation

1,03915

add a comment |

up vote
0
down vote

Assuming that they have match on the same row, and in different columns, this code should work:

def check(row):

    forward = (to_keep['source'] == row['from']) & (to_keep['target'] == row['to'])

    reverse = (to_keep['source'] == row['to']) & (to_keep['target'] == row['from'])

    return any(forward) | any(reverse)





kept_df = df_all.loc[[check(row) for row in df_all.iterrows()]]

edited Jun 15 at 22:31

answered Jun 15 at 21:59

Acccumulation

1,03915

Assuming that they have match on the same row, and in different columns, this code should work:

def check(row):

    forward = (to_keep['source'] == row['from']) & (to_keep['target'] == row['to'])

    reverse = (to_keep['source'] == row['to']) & (to_keep['target'] == row['from'])

    return any(forward) | any(reverse)





kept_df = df_all.loc[[check(row) for row in df_all.iterrows()]]

edited Jun 15 at 22:31

answered Jun 15 at 21:59

Acccumulation

1,03915

edited Jun 15 at 22:31

answered Jun 15 at 21:59

Acccumulation

1,03915

answered Jun 15 at 21:59

Acccumulation

1,03915

answered Jun 15 at 21:59

Acccumulation

1,03915

add a comment |

draft saved

draft discarded

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Gfrktyl