Your advanced code editor is loading.

Please wait a moment.

#editor-loading { display: none; }
{
          
          
“cells”: [
{
“cell_type”: “markdown”,
“metadata”: {},
“source”: [
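“## Linking in Spark\n”,
“\n”,
“This example uses Splink's Spark backend (`SparkLinker`) to deduplicate the records in `fake_1000.csv` (`link_type` is `dedupe_only`).\n”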
]
},
{
“cell_type”: “code”,
“execution_count”: 1,
“metadata”: {},
“outputs”: [
{
“name”: “stderr”,
“output_type”: “stream”,
“text”: [
“23/06/22 16:31:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform… using builtin-java classes where applicable\n”,
“Setting default log level to “WARN”.\n”,
“To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n”,
“23/06/22 16:31:36 WARN Utils: Service ‘SparkUI’ could not bind on port 4040. Attempting port 4041.\n”
]
}
],
“source”: [
“from splink.spark.jar_location import similarity_jar_location\n”,
“\n”,
“from pyspark import SparkContext, SparkConf\n”,
“from pyspark.sql import SparkSession\n”,
“from pyspark.sql import types\n”,
“\n”,
“conf = SparkConf()\n”,
“# The parallelism setting below (spark.default.parallelism = 16) is only suitable for a small toy example\n”,
“conf.set(“spark.driver.memory”, “12g”)\n”,
“conf.set(“spark.default.parallelism”, “16”)\n”,
“\n”,
“\n”,
“# Add custom similarity functions, which are bundled with Splink\n”,
“# documented here: https://github.com/moj-analytical-services/splink_scalaudfs\n”,
“path = similarity_jar_location()\n”,
“conf.set(“spark.jars”, path)\n”,
“\n”,
“sc = SparkContext.getOrCreate(conf=conf)\n”,
“\n”,
“spark = SparkSession(sc)\n”,
“spark.sparkContext.setCheckpointDir(“./tmp_checkpoints”)”
]
},
{
“cell_type”: “code”,
“execution_count”: 2,
“metadata”: {},
“outputs”: [
{
“name”: “stderr”,
“output_type”: “stream”,
“text”: [
”                                                                                \r”
]
}
],
“source”: [
“import pandas as pd \n”,
“df = spark.read.csv(“../../data/fake_1000.csv”, header=True)”
]
},
{
“cell_type”: “code”,
“execution_count”: 3,
“metadata”: {},
“outputs”: [],
“source”: [
“import splink.spark.comparison_library as cl\n”,
“import splink.spark.comparison_template_library as ctl\n”,
“\n”,
“settings = {\n”,
”    “link_type”: “dedupe_only”,\n”,
”    “comparisons”: [\n”,
”        ctl.name_comparison(“first_name”),\n”,
”        ctl.name_comparison(“surname”),\n”,
”        ctl.date_comparison(“dob”, cast_strings_to_date=True),\n”,
”        cl.exact_match(“city”, term_frequency_adjustments=True),\n”,
”        ctl.email_comparison(“email”),\n”,
”    ],\n”,
”    “blocking_rules_to_generate_predictions”: [\n”,
”        “l.first_name = r.first_name”,\n”,
”        “l.surname = r.surname”,\n”,
”    ],\n”,
”    “retain_matching_columns”: True,\n”,
”    “retain_intermediate_calculation_columns”: True,\n”,
”    “em_convergence”: 0.01\n”,
“}”
]
},
{
“cell_type”: “code”,
“execution_count”: 4,
“metadata”: {},
“outputs”: [
{
“name”: “stderr”,
“output_type”: “stream”,
“text”: [
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/dataframe.py:169: UserWarning: DataFrame.sql_ctx is an internal property, and will be removed in future releases. Use DataFrame.sparkSession instead.\n”,
”  warnings.warn(\n”,
“–WARN– \n”,
” You are using datediff comparison\n”,
”                        with str-casting and ANSI is not enabled. Bad dates\n”,
”                        e.g. 1999-13-54 will not trigger an exception but will\n”,
”                        classed as comparison level = “ELSE”. Ensure date strings\n”,
”                        are cleaned to remove bad dates \n”,
“\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“Probability two random records match is estimated to be  0.00389.\n”,
“This means that amongst all possible pairwise record comparisons, one in 257.25 are expected to match.  With 499,500 total possible comparisons, we expect a total of around 1,941.67 matching pairs\n”
]
}
],
“source”: [
“from splink.spark.linker import SparkLinker\n”,
“linker = SparkLinker(df, settings)\n”,
“deterministic_rules = [\n”,
”    “l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1”,\n”,
”    “l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1”,\n”,
”    “l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2”,\n”,
”    “l.email = r.email”\n”,
”]\n”,
“\n”,
“linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.6)\n”
]
},
{
“cell_type”: “code”,
“execution_count”: 5,
“metadata”: {},
“outputs”: [
{
“name”: “stderr”,
“output_type”: “stream”,
“text”: [
“—– Estimating u probabilities using random sampling —–\n”,
“23/06/22 16:31:50 WARN DataSource: All paths were ignored:                      \n”,
”  file:/Users/rosskennedy/splink_demos/examples/spark/tmp_checkpoints/86781ded-f9ca-4a1c-9ce5-8f04c3ae497d/__splink__df_concat_with_tf_88a64498f\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
”                                                                                \n”,
“Estimated u probabilities using random sampling\n”,
“\n”,
“Your model is not yet fully trained. Missing estimates for:\n”,
”    - first_name (no m values are trained).\n”,
”    - surname (no m values are trained).\n”,
”    - dob (no m values are trained).\n”,
”    - city (no m values are trained).\n”,
”    - email (no m values are trained).\n”
]
}
],
“source”: [
“linker.estimate_u_using_random_sampling(max_pairs=5e5)”
]
},
{
“cell_type”: “code”,
“execution_count”: 6,
“metadata”: {},
“outputs”: [
{
“name”: “stderr”,
“output_type”: “stream”,
“text”: [
“\n”,
“—– Starting EM training session —–\n”,
“\n”,
“Estimating the m probabilities of the model by blocking on:\n”,
“l.first_name = r.first_name and l.surname = r.surname\n”,
“\n”,
“Parameter estimates will be made for the following comparison(s):\n”,
”    - dob\n”,
”    - city\n”,
”    - email\n”,
“\n”,
“Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n”,
”    - first_name\n”,
”    - surname\n”,
“23/06/22 16:32:01 WARN DataSource: All paths were ignored:                      \n”,
”  file:/Users/rosskennedy/splink_demos/examples/spark/tmp_checkpoints/86781ded-f9ca-4a1c-9ce5-8f04c3ae497d/__splink__df_comparison_vectors_e68b1381e\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 1: Largest change in params was -0.54 in the m_probability of dob, level Exact match\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 2: Largest change in params was 0.0362 in probability_two_random_records_match\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 3: Largest change in params was 0.00798 in probability_two_random_records_match\n”,
“\n”,
“EM converged after 3 iterations\n”,
“m probability not trained for email - Jaro_winkler Username >= 0.88 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n”,
“\n”,
“Your model is not yet fully trained. Missing estimates for:\n”,
”    - first_name (no m values are trained).\n”,
”    - surname (no m values are trained).\n”,
”    - email (some m values are not trained).\n”,
“\n”,
“—– Starting EM training session —–\n”,
“\n”,
“Estimating the m probabilities of the model by blocking on:\n”,
“l.dob = r.dob\n”,
“\n”,
“Parameter estimates will be made for the following comparison(s):\n”,
”    - first_name\n”,
”    - surname\n”,
”    - city\n”,
”    - email\n”,
“\n”,
“Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n”,
”    - dob\n”,
“23/06/22 16:32:06 WARN DataSource: All paths were ignored:                      \n”,
”  file:/Users/rosskennedy/splink_demos/examples/spark/tmp_checkpoints/86781ded-f9ca-4a1c-9ce5-8f04c3ae497d/__splink__df_comparison_vectors_5ba0cbd7b\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 1: Largest change in params was -0.414 in the m_probability of surname, level Exact match surname\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 2: Largest change in params was 0.11 in the m_probability of first_name, level All other comparisons\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 3: Largest change in params was 0.0384 in probability_two_random_records_match\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 4: Largest change in params was 0.0138 in probability_two_random_records_match\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“\n”,
“WARNING:\n”,
“Level Jaro_winkler Username >= 0.88 on comparison email not observed in dataset, unable to train m value\n”,
“Iteration 5: Largest change in params was 0.00581 in probability_two_random_records_match\n”,
“\n”,
“EM converged after 5 iterations\n”,
“m probability not trained for email - Jaro_winkler Username >= 0.88 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n”,
“\n”,
“Your model is not yet fully trained. Missing estimates for:\n”,
”    - email (some m values are not trained).\n”
]
}
],
“source”: [
“training_blocking_rule = “l.first_name = r.first_name and l.surname = r.surname”\n”,
“training_session_fname_sname = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n”,
“\n”,
“training_blocking_rule = “l.dob = r.dob”\n”,
“training_session_dob = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)”
]
},
{
“cell_type”: “code”,
“execution_count”: 7,
“metadata”: {},
“outputs”: [
{
“name”: “stderr”,
“output_type”: “stream”,
“text”: [
“23/06/22 16:32:14 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting ‘spark.sql.debug.maxToStringFields’.\n”,
“23/06/22 16:32:17 WARN DataSource: All paths were ignored:                      \n”,
”  file:/Users/rosskennedy/splink_demos/examples/spark/tmp_checkpoints/86781ded-f9ca-4a1c-9ce5-8f04c3ae497d/__splink__df_predict_0ae386ee5\n”,
“\n”,
” – WARNING –\n”,
“You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.\n”,
“Comparison: ‘email’:\n”,
”    m values not fully trained\n”
]
}
],
“source”: [
“results = linker.predict(threshold_match_probability=0.9)”
]
},
{
“cell_type”: “code”,
“execution_count”: 8,
“metadata”: {
“tags”: []
},
“outputs”: [
{
“name”: “stderr”,
“output_type”: “stream”,
“text”: [
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”,
“/Users/rosskennedy/Library/r-miniconda/lib/python3.9/site-packages/pyspark/sql/pandas/utils.py:37: DeprecationWarning: distutils Version classes are deprecated. Use packaging.version instead.\n”,
”  if LooseVersion(pandas.version) < LooseVersion(minimum_pandas_version):\n”
]
},
{
“data”: {
“text/html”: [
“
\n”,
“\n”,
“\n”,
”  \n”,
”    \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”    \n”,
”  \n”,
”  \n”,
”    \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”    \n”,
”    \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”    \n”,
”    \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”    \n”,
”    \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”    \n”,
”    \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”      \n”,
”    \n”,
”  \n”,
“
match_weightmatch_probabilityunique_id_lunique_id_rfirst_name_lfirst_name_rgamma_first_namebf_first_namesurname_lsurname_r…gamma_citytf_city_ltf_city_rbf_citybf_tf_adj_cityemail_lemail_rgamma_emailbf_emailmatch_key
016.9147340.9999925356JaydenJayden485.028963BennettBennett…00.001230.015990.4629591.000000jb88@king.comjb88@king.com4252.5908560
18.4304300.997110486487HannahHannah485.028963DaviesNone…10.049200.0492010.2012191.120874Nonehannahd14@cardenas.com-11.0000000
214.1519690.999945708710MayaMaya485.028963CurtsiCurtis…-1NaN0.022141.0000001.000000mcurtis53@simpsoun.commcurtis53@simpson.com3212.9033390
316.8358260.999991683685RosieRosie485.028963nJohnstonJohnoston…10.008610.0086110.2012196.404996rosiej32@robinson-moran.netrosiej32@robinson-moran.net4252.5908560
410.4022810.999262259260OliverOliver485.028963HguehesHughes…-1NaN0.004921.0000001.000000ohughes@page-kim.infoNone-11.0000000
\n”,
“
5 rows × 28 columns
\n”,
“
”
],
“text/plain”: [
”   match_weight  match_probability unique_id_l unique_id_r first_name_l  \n”,
“0     16.914734           0.999992          53          56       Jayden   \n”,
“1      8.430430           0.997110         486         487       Hannah   \n”,
“2     14.151969           0.999945         708         710         Maya   \n”,
“3     16.835826           0.999991         683         685        Rosie   \n”,
“4     10.402281           0.999262         259         260       Oliver   \n”,
“\n”,
”  first_name_r  gamma_first_name  bf_first_name  surname_l  surname_r  …  \n”,
“0       Jayden                 4      85.028963    Bennett    Bennett  …   \n”,
“1       Hannah                 4      85.028963     Davies       None  …   \n”,
“2         Maya                 4      85.028963     Curtsi     Curtis  …   \n”,
“3        Rosie                 4      85.028963  nJohnston  Johnoston  …   \n”,
“4       Oliver                 4      85.028963    Hguehes     Hughes  …   \n”,
“\n”,
”   gamma_city  tf_city_l tf_city_r    bf_city  bf_tf_adj_city  \n”,
“0           0    0.00123   0.01599   0.462959        1.000000   \n”,
“1           1    0.04920   0.04920  10.201219        1.120874   \n”,
“2          -1        NaN   0.02214   1.000000        1.000000   \n”,
“3           1    0.00861   0.00861  10.201219        6.404996   \n”,
“4          -1        NaN   0.00492   1.000000        1.000000   \n”,
“\n”,
”                       email_l                      email_r gamma_email  \n”,
“0                jb88@king.com                jb88@king.com           4   \n”,
“1                         None       hannahd14@cardenas.com          -1   \n”,
“2       mcurtis53@simpsoun.com        mcurtis53@simpson.com           3   \n”,
“3  rosiej32@robinson-moran.net  rosiej32@robinson-moran.net           4   \n”,
“4        ohughes@page-kim.info                         None          -1   \n”,
“\n”,
”     bf_email  match_key  \n”,
“0  252.590856          0  \n”,
“1    1.000000          0  \n”,
“2  212.903339          0  \n”,
“3  252.590856          0  \n”,
“4    1.000000          0  \n”,
“\n”,
“[5 rows x 28 columns]”
]
},
“execution_count”: 8,
“metadata”: {},
“output_type”: “execute_result”
}
],
“source”: [
“results.as_pandas_dataframe(limit=5)”
]
}
],
“metadata”: {
“kernelspec”: {
“display_name”: “Python 3 (ipykernel)”,
“language”: “python”,
“name”: “python3”
},
“language_info”: {
“codemirror_mode”: {
“name”: “ipython”,
“version”: 3
},
“file_extension”: “.py”,
“mimetype”: “text/x-python”,
“name”: “python”,
“nbconvert_exporter”: “python”,
“pygments_lexer”: “ipython3”,
“version”: “3.9.2”
}
},
“nbformat”: 4,
“nbformat_minor”: 4
}
Your advanced code editor is loading.

Please wait a moment.

Explanation

Graph

Symbols

We couldn't identify any entrypoints. If you believe this to be incorrect then please contact support.