This is a notebook used to choose between several magnitude models.
It is outdated in the sense that the underlying code has changed since the notebook was last run, and it will require tweaks to get it to run, but the big picture has not changed.
The result of this notebook was to choose what I called the ad-hoc model, which uses a linear model determined with bright stars and an instrument response determined with faint stars. More recently, that model was superceded by a simpler model that is closer to astronomer expectations, by the ad-hoc model is nevertheless the best fit.
More recent versions of all functions in this notebook can be found in agasc_gaia.gaia_model.
In [1]:
from astropy.table import Table
import pandas as pd
import seaborn as sns
import matplotlib
matplotlib.rcParams['font.size'] = 22
sns.set_style("whitegrid")
pd.options.display.float_format = '{:,.3f}'.format
In [2]:
# required for the iterative imputer in the next line
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA
In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
In [4]:
%matplotlib inline
In [7]:
data = Table.read('data/agasc-gsc-tycho-gaia-x-match-train.fits')
data['mag_bin'] = np.digitize(data['ave_gaia_mag'], np.linspace(0, 20, 41))
sel = (
    (np.abs(data['mag_aca'] - data['ave_gaia_mag']) < 5)
    & (data['neighbor_mag_weight'] > 0.1)
    & (data['n_mag_neighbors'] == 1)
    & ~data['phot_variable_flag']
    & (data['has_mag'] > 0)
)
data = data[sel]
data = data.to_pandas()
Plotting Functions¶
In [8]:
def plot_weights(train):
    fig=plt.figure(figsize=(16, 6))
    ax1 = plt.subplot(131)
    ax2 = plt.subplot(132, sharex=ax1, )
    ax3 = plt.subplot(133, sharex=ax1, sharey=ax2)
    axes = [ax1, ax2, ax3]
    plt.sca(axes[0])
    plt.hist(
        train['ave_gaia_mag'],
        bins=b,
        histtype='step'
    )
    plt.yscale('log')
    plt.title('Magnitude Histogram')
    plt.xlabel('Average Gaia Mag.')
    plt.xlim((5, 12.5))
    plt.sca(axes[1])
    plt.plot(
        train['ave_gaia_mag'],
        train['stat_weight'],
        '.'
    )
    plt.title('Statistical Weight')
    plt.xlabel('Average Gaia Mag.')
    plt.yscale('log')
    plt.xlim((5, 11.5))
    plt.sca(axes[2])
    plt.plot(
        train['ave_gaia_mag'],
        train['weight'],
        '.'
    )
    plt.title('Weight')
    plt.xlabel('Average Gaia Mag.')
    plt.yscale('log')
    plt.xlim((5, 12))
    plt.tight_layout()
    return fig, axes
    
def plot_res_scatter(train, test):
    fig, axes = plt.subplots(
        1, 2,
        figsize=(16, 4),
        sharey=True,
        sharex=True
    )
    plt.sca(axes[0])
    plt.scatter(train.mag_aca_obs, train['y_res'])
    plt.ylabel(r'mag_aca$_{pred}$ - mag_aca$_{true}$')
    plt.xlabel('mag_aca')
    plt.title('Train')
    plt.sca(axes[1])
    plt.scatter(test.mag_aca_obs, test['y_res'])
    plt.title('Test')
    plt.xlabel('mag_aca')
    return fig, axes
def plot_res_1d(train, test):
    # fig = plt.figure(figsize=(12, 8))
    fig = plt.figure(figsize=(9.75, 6.5))
    plt.hist(
        train['y_res'],
        density=True,
        bins=np.linspace(-2.5, 2.5, 120),
        histtype='step',
        label='Train',
        linewidth=2
    )
    plt.hist(
        test['y_res'],
        density=True,
        bins=np.linspace(-2.5, 2.5, 120),
        histtype='step',
        label='Test',
        linewidth=2
    )
    plt.ylabel('probability density')
    plt.xlabel('Predicted - observed magnitude')
    plt.title('Distribution of magnitude errors (test/train)')
    plt.yscale('log')
    plt.legend(loc='upper left')
    plt.tight_layout()
    return fig, axes
def plot_res_2d(train):
    fig = plt.figure(figsize=(16, 8))
    sns.histplot(
        train,
        x='ave_gaia_mag', y='y_res',
        ax=plt.gca(),
        bins=[
            np.linspace(5, 12, 81),
            np.linspace(-1., 1, 161)
        ]
    )
    plt.ylim((-0.2, 0.2))
    return fig
def plot_res_v_mag(residuals, t_residuals):
    fig, axes = plt.subplots(
        2, 1,
        figsize=(12, 10),
        sharex=True,
        gridspec_kw={'height_ratios': [3, 1]}
    )
    plt.suptitle(f'Residuals')
    plt.sca(axes[0])
    plt.errorbar(
        residuals['ave_gaia_mag'].values,
        residuals['y_res'].values,
        yerr=residuals['d_y_res'] / np.sqrt(residuals['n']),
        marker='o',
        linestyle='none',
        label='train',
    )
    plt.errorbar(
        t_residuals['ave_gaia_mag'].values,
        t_residuals['y_res'].values,
        yerr=t_residuals['d_y_res'] / np.sqrt(t_residuals['n']),
        marker='o',
        linestyle='none',
        label='test'
    )
    plt.ylabel('residuals')
    plt.ylim((-.3, 1.))
    plt.legend(loc='upper left', fontsize='x-small')
    plt.sca(axes[1])
    plt.scatter(
        residuals['ave_gaia_mag'].values,
        residuals['y_res'].values / residuals['d_y_res'],
        marker='o'
    )
    plt.scatter(
        t_residuals['ave_gaia_mag'].values,
        t_residuals['y_res'].values / t_residuals['d_y_res'],
        marker='o'
    )
    plt.ylabel('pull')
    plt.ylim((-2, 2))
    plt.xlabel('mean Gaia mag')
    plt.tight_layout()
    
    return fig, axes
Pipeline Definitions¶
In [9]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
def get_weights(train, plot=False):
    v, b = np.histogram(
        (train['g_mag'] + train['bp_mag'] + train['rp_mag']) / 3,
        bins=np.linspace(0, 20, 241),
    )
    m = v>=200
    v[~m] = v[m].min()
    w = 1/v
    i = np.digitize(train['ave_gaia_mag'], b) - 1
    train['stat_weight'] = 0.
    train['stat_weight'] = w[i]
    thr = 9.
    train.loc[i > np.digitize(thr, b), 'stat_weight'] = w[np.digitize(thr, b)]
    train['stat_weight'] /= np.max(train['stat_weight'])
    train['weight'] = (
        train['stat_weight'] 
        * train['mag_aca_err_obs'].min()/train['mag_aca_err_obs']
    )
    
    if plot:
        plot_weights(train)
    return train['weight']
class GaiaSimpleFeatures(TransformerMixin, BaseEstimator):
    def __init__(self, n=1):
        self.n = n
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        n_samples, n_features = X.shape
        XP = np.empty(shape=(n_samples, 1 + self.n), dtype=X.dtype)
        XP[:, 0] = np.mean(X, axis=1)
        # XP[:,1] = X[:,0] - X[:,1]
        XP[:,1] = X[:,2] - X[:,1]
        for i in range(self.n - 1):
            XP[:,2 + i] = (X[:,0] - X[:,1])**(2 + i)
        return XP
    def get_feature_names(self, input_features=None):
        names = ['mean_mag', 'color']
        for i in range(self.n - 1):
            names += [f'mean_mag_{i+2}']
        return names
    def get_feature_names_out(self, input_features=None):
        names = ['mean_mag', 'color']
        for i in range(self.n - 1):
            names += [f'mean_mag_{i+2}']
        return names
class Dummy(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        X, y = self._validate_data(X, y)
        print('FITTING')
        print('X:', X.shape, np.any(np.isnan(X)), np.min(X), np.min(X))
        print('y:', y.shape, np.any(np.isnan(y)), np.min(y), np.min(y))
        return self
    def transform(self, X):
        return X
    def get_feature_names(self, input_features=None):
        return []
    def get_feature_names_out(self, input_features=None):
        return []
class LogX:
    def __init__(self):
        pass
    def __call__(self, X:pd.DataFrame, y=None) -> pd.DataFrame:
        res = np.empty(shape=(X.shape[0], 3))
        res[:,0] = np.log(X[:,0])
        res[:,1] = X[:,1]
        res[:,2] = np.log(X[:,0])**2
        return res
# BaseEstimator adds get_params, set_params, _validate_data and other goodies
# RegressorMixin adds a standard score() member function
class AdHoc(RegressorMixin, BaseEstimator):
    def __init__(self, low_threshold=1e10, method=1):
        self.low_threshold = low_threshold
        self.method = method
    def fit(self, X, y=None, **kwargs):
        self.linear_ = LinearRegression()
        if 'sample_weight' in kwargs:
            sigma = 1./kwargs['sample_weight']
            sigma /= np.mean(sigma)
        else:
            sigma = None
        X, y = self._validate_data(X, y)
        m = X[:,0] < self.low_threshold
        self.linear_.fit(X[m], y[m])
        y2 = (self.linear_.predict(X) - y)
        
        self.mean_ = np.array([np.mean(X), np.mean(y2)])
        X = X - self.mean_[0]
        y2 = y2 - self.mean_[1]
        
        self.std_ = np.array([np.std(X), np.std(y2)])
        X /= self.std_[0]
        y2 /= self.std_[1]
        if self.method == 1:
            self.params_, self.covariance_ = scipy.optimize.curve_fit(self._broken_1, X.T[0], y2, p0=[0., 0.5, 0., 0.], sigma=sigma)
        elif self.method == 2:
            self.params_, self.covariance_ = scipy.optimize.curve_fit(self._broken_2, X.T[0], y2, p0=[0.5, 0.], sigma=sigma)
        return self
    def predict(self, X):
        broken = self._broken_1 if self.method == 1 else self._broken_2
        check_is_fitted(self)
        X = self._validate_data(X)
        y = self.linear_.predict(X)
        if self.method == 0:
            y_res = 0
        else:
            y_res = self.mean_[1] + self.std_[1] * broken((X.T[0] - self.mean_[0]) / self.std_[0], *self.params_)
        return y - y_res
    @staticmethod
    def _broken_1(x, b, point, m1, m2):
        y = np.empty(len(x))
        y[x < point] = b + m1 * x[x < point]
        y[x >= point] = b + m1 * (point) + m2 * (x[x >= point] - point)
        return y
    @staticmethod
    def _broken_2(x, point, m2):
        y = np.empty(len(x))
        y[x < point] = 0
        y[x >= point] = m2 * (x[x >= point] - point)
        return y
class Broken(RegressorMixin, BaseEstimator):
    def __init__(self, threshold=0):
        self.threshold = threshold
    def fit(self, X, y=None):
        self.linear_1_ = LinearRegression()
        self.linear_2_ = LinearRegression()
        
        X, y = self._validate_data(X, y)
        x = X[:,0]
        print(np.min(x), np.max(x), self.threshold)
        m = x < self.threshold
        print(np.count_nonzero(m))
        self.linear_1_.fit(X[m], y[m])
        m = x >= self.threshold
        print(np.count_nonzero(m))
        self.linear_2_.fit(X[m], y[m])
        return self
    def predict(self, X):
        check_is_fitted(self)
        X = self._validate_data(X)
        
        y = np.empty(len(X))
        m = X[:,0] < self.threshold
        if np.count_nonzero(m):
            y[m] = self.linear_1_.predict(X[m])
        m = X[:,0] >= self.threshold
        if np.count_nonzero(m):
            y[m] = self.linear_2_.predict(X[m])
        return y
# from sklearn.utils.estimator_checks import check_estimator
# check_estimator(AdHoc())
# check_estimator(Broken())
In [10]:
def linear_pca_1():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('pca', PCA(n_components=1)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline
def linear_pca_2():
    return Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('pca', PCA(n_components=2)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline
def linear_pca_3():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('pca', PCA(n_components=3)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline
def ad_hoc():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures()),
            ('ad_hoc', AdHoc(low_threshold=9)),
        ]
    )
    return pipeline
def simple():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures()),
            ('linear', LinearRegression())
        ]
    )
    return pipeline
def simple2():
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures(n=2)),
            ('linear', LinearRegression())
        ]
    )
    return pipeline
def simple_log():
    from sklearn.compose import TransformedTargetRegressor
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('simple', GaiaSimpleFeatures()),
            # ('scaler', StandardScaler()),
            ('log', FunctionTransformer(LogX())),
            # ('dummy', Dummy()),
            ('linear', TransformedTargetRegressor(
                regressor=LinearRegression(),
                func=np.log,
                inverse_func=np.exp,
                check_inverse=False
            ))
        ]
    )
    return pipeline
def rf():
    from sklearn.ensemble import RandomForestRegressor
    pipeline = Pipeline(
        steps=[
            ('imputer', IterativeImputer(max_iter=50)),
            ('scale', StandardScaler()),
            ('rf', RandomForestRegressor(n_estimators=20, min_samples_split=10))
        ]
    )
    return pipeline
In [11]:
pipelines = {
    'ad_hoc': {
        'description': 'Ad-Hoc',
        'pipeline': ad_hoc,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'ad_hoc_w': {
        'description': 'Ad-Hoc',
        'pipeline': ad_hoc,
        'args': lambda d: {'ad_hoc__sample_weight': get_weights(d)},
        'select': lambda d: d,
    },
    'ad_hoc_2': {
        'description': 'Ad-Hoc',
        'pipeline': lambda : Pipeline(
            steps=[
                ('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=9, method=2)),
            ]
        ),
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'ad_hoc_3': {
        'description': 'Ad-Hoc',
        'pipeline': lambda : Pipeline(
            steps=[
                ('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=8, method=1)),
            ]
        ),
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'rf': {
        'description': 'Random Forest',
        'pipeline': rf,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'linear_pca_1': {
        'description': '1d PCA',
        'pipeline': linear_pca_1,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'linear_pca_2': {
        'description': '2d PCA',
        'pipeline': linear_pca_2,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'linear_pca_3': {
        'description': '3d PCA',
        'pipeline': linear_pca_3,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'simple': {
        'description': 'Simple, not weighted',
        'pipeline': simple,
        'args': lambda d: {},
        'select': lambda d: d,
    },
    'simple_9': {
        'description': 'Simple, 9 min mag, weighted',
        'pipeline': simple,
        'args': lambda d: {'linear__sample_weight': get_weights(d)},
        'select': lambda d: d[(d['ave_gaia_mag'] < 9.)].reset_index(),
    },
    'simple_10_3': {
        'description': 'Simple, 10.3 min mag, weighted',
        'pipeline': simple,
        'args': lambda d: {'linear__sample_weight': get_weights(d)},
        'select': lambda d: d[(d['ave_gaia_mag'] < 10.3)].reset_index(),
    },
    'simple_n2': {
        'description': 'Simple, pol-2, weighted',
        'pipeline': simple2,
        'args': lambda d: {'linear__sample_weight': get_weights(d)},
        'select': lambda d: d,
    },
    'simple_log': {
        'description': 'Simple, log',
        'pipeline': simple_log,
        'args': lambda d: {},
        'select': lambda d: d,
    },
}
In [12]:
def mse_low(y_true, y_pred):
    m = y_true < 7
    return np.mean((y_true[m] - y_pred[m])**2)
def mse_mid(y_true, y_pred):
    m = (y_true > 9) & (y_true < 10)
    return np.mean((y_true[m] - y_pred[m])**2)
def mse_hi(y_true, y_pred):
    m = y_true > 10.5
    return np.mean((y_true[m] - y_pred[m])**2)
def mse(y_true, y_pred):
    a = np.mean((y_true - y_pred)**2)
    return a
def mse_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    return np.mean((y_true[m] - y_pred[m])**2)
def var(y_true, y_pred):
    return np.var(y_true - y_pred)
def q90(y_true, y_pred):
    q5, q95 = np.quantile(y_true - y_pred, [0.05, 0.95])
    return q95 - q5
def q99(y_true, y_pred):
    q5, q95 = np.quantile(y_true - y_pred, [0.005, 0.995])
    return q95 - q5
def q90_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    q5, q95 = np.quantile(y_true[m] - y_pred[m], [0.05, 0.95])
    return q95 - q5
def q99_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    q5, q95 = np.quantile(y_true[m] - y_pred[m], [0.005, 0.995])
    return q95 - q5
def mean_mag_r(y_true, y_pred, bmin, bmax):
    m = (y_true > bmin) & (y_true < bmax)
    return np.mean(y_true[m])
In [13]:
pipeline = pipelines['ad_hoc']['pipeline']()
x_train = data[['g_mag', 'bp_mag', 'rp_mag']]
y_train = data[['mag_aca_obs']]['mag_aca_obs']
pipeline.fit(x_train, y_train)
Out[13]:
Pipeline(steps=[('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=9))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('imputer', IterativeImputer(max_iter=50)),
                ('simple', GaiaSimpleFeatures()),
                ('ad_hoc', AdHoc(low_threshold=9))])IterativeImputer(max_iter=50)
GaiaSimpleFeatures()
AdHoc(low_threshold=9)
In [14]:
y_pred = pipeline.predict(x_train)
y_true = y_train.values
In [15]:
from sklearn.metrics import make_scorer
scoring = {
    'mse_low': make_scorer(mse_low, greater_is_better=False),
    'mse_mid': make_scorer(mse_mid, greater_is_better=False),
    'mse_hi': make_scorer(mse_hi, greater_is_better=False),
    'mse': make_scorer(mse, greater_is_better=False),
    'q90': make_scorer(q90),
    'q99': make_scorer(q99),
}
bins = np.linspace(5.5, 11.5, 13)
for i, (bmin, bmax) in enumerate(zip(bins[:-1], bins[1:])):
    scoring[f'mse_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: mse_r(y_true, y_pred, b1, b2))
    scoring[f'q90_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: q90_r(y_true, y_pred, b1, b2))
    scoring[f'q99_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: q99_r(y_true, y_pred, b1, b2))
    scoring[f'mean_mag_bin_{i}'] = make_scorer(lambda y_true, y_pred, b1=bmin, b2=bmax: mean_mag_r(y_true, y_pred, b1, b2))
x_train = data[['g_mag', 'bp_mag', 'rp_mag']]
y_train = data[['mag_aca_obs']]['mag_aca_obs']
scores_list = []
names = ['simple', 'rf', 'ad_hoc', 'simple_10_3', 'simple_9', 'linear_pca_2', 'linear_pca_3', 'ad_hoc_2']
names = ['simple', 'rf', 'ad_hoc', 'linear_pca_2', 'linear_pca_3']
for name in names:
    print(name)
    n_cv = 10
    pipeline = pipelines[name]['pipeline']()
    cv_scores = cross_validate(pipeline, x_train, y_train, scoring=scoring, cv=n_cv, return_train_score=True)
    cv_scores['name'] = [name] * n_cv
    scores_list.append(cv_scores)
simple rf ad_hoc linear_pca_2 linear_pca_3
In [16]:
scores = pd.DataFrame({key: np.concatenate([s[key] for s in scores_list]) for key in scores_list[0].keys()})
In [17]:
score_summary = pd.DataFrame()
s = scores.groupby('name').mean()
mag_cols = [col for col in scores.columns if re.match('test_mean_mag_bin_', col)]
score_summary['mean_mag'] = s.loc['simple'][mag_cols].values
mse_cols = [col for col in scores.columns if re.match('test_mse_bin_', col)]
q90_cols = [col for col in scores.columns if re.match('test_q90_bin_', col)]
q99_cols = [col for col in scores.columns if re.match('test_q99_bin_', col)]
cols = [('mean_mag', '')]
for name in s.index:
    cols += [(name, 'q90'), (name, 'q99'), (name, 'mse')]
    score_summary[name, 'q90'] = np.sqrt(s.loc[name][q90_cols].values)
    score_summary[name, 'q99'] = np.sqrt(s.loc[name][q99_cols].values)
    score_summary[name, 'mse'] = np.sqrt(s.loc[name][mse_cols].values)
score_summary.columns = pd.MultiIndex.from_tuples(cols)
score_summary
Out[17]:
| mean_mag | ad_hoc | linear_pca_2 | linear_pca_3 | rf | simple | |||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| q90 | q99 | mse | q90 | q99 | mse | q90 | q99 | mse | q90 | q99 | mse | q90 | q99 | mse | ||
| 0 | 5.833 | 0.261 | 0.302 | 0.029 | 0.257 | 0.297 | 0.057 | 0.269 | 0.305 | 0.058 | 0.343 | 0.449 | 0.054 | 0.259 | 0.299 | 0.057 | 
| 1 | 6.293 | 0.246 | 0.347 | 0.028 | 0.244 | 0.342 | 0.052 | 0.252 | 0.354 | 0.052 | 0.280 | 0.536 | 0.050 | 0.245 | 0.345 | 0.052 | 
| 2 | 6.778 | 0.242 | 0.349 | 0.031 | 0.242 | 0.347 | 0.044 | 0.249 | 0.355 | 0.045 | 0.269 | 0.523 | 0.046 | 0.244 | 0.348 | 0.044 | 
| 3 | 7.272 | 0.249 | 0.440 | 0.047 | 0.249 | 0.434 | 0.049 | 0.255 | 0.446 | 0.050 | 0.261 | 0.457 | 0.046 | 0.250 | 0.437 | 0.049 | 
| 4 | 7.774 | 0.253 | 0.344 | 0.022 | 0.252 | 0.342 | 0.023 | 0.257 | 0.353 | 0.024 | 0.266 | 0.461 | 0.033 | 0.253 | 0.345 | 0.023 | 
| 5 | 8.269 | 0.267 | 0.380 | 0.036 | 0.265 | 0.375 | 0.035 | 0.270 | 0.386 | 0.036 | 0.282 | 0.466 | 0.042 | 0.267 | 0.378 | 0.035 | 
| 6 | 8.768 | 0.294 | 0.426 | 0.036 | 0.292 | 0.424 | 0.036 | 0.295 | 0.424 | 0.036 | 0.305 | 0.459 | 0.041 | 0.292 | 0.424 | 0.036 | 
| 7 | 9.250 | 0.345 | 0.488 | 0.055 | 0.345 | 0.490 | 0.057 | 0.346 | 0.485 | 0.056 | 0.356 | 0.494 | 0.055 | 0.345 | 0.489 | 0.056 | 
| 8 | 9.725 | 0.425 | 0.582 | 0.077 | 0.435 | 0.595 | 0.079 | 0.434 | 0.591 | 0.079 | 0.439 | 0.594 | 0.081 | 0.435 | 0.594 | 0.079 | 
| 9 | 10.191 | 0.539 | 0.701 | 0.106 | 0.552 | 0.714 | 0.114 | 0.551 | 0.712 | 0.114 | 0.549 | 0.724 | 0.108 | 0.551 | 0.713 | 0.114 | 
| 10 | 10.680 | 0.796 | 1.198 | 0.297 | 0.822 | 1.225 | 0.303 | 0.822 | 1.225 | 0.303 | 0.819 | 1.185 | 0.298 | 0.822 | 1.225 | 0.303 | 
| 11 | 11.217 | 1.066 | 1.209 | 0.588 | 1.095 | 1.244 | 0.604 | 1.094 | 1.244 | 0.604 | 1.177 | 1.296 | 0.631 | 1.095 | 1.244 | 0.604 | 
In [18]:
fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc', 'q99'], 'o-', label='ad_hoc')
plt.plot(score_summary['mean_mag'], score_summary['rf', 'q99'], label='rf')
plt.plot(score_summary['mean_mag'], score_summary['simple', 'q99'], label='simple')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_2', 'q99'], label='PCA2')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_3', 'q99'], label='PCA3')
plt.legend(loc='upper left')
Out[18]:
<matplotlib.legend.Legend at 0x12f922080>
In [19]:
fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc', 'q90'], 'o-', label='ad_hoc')
plt.plot(score_summary['mean_mag'], score_summary['rf', 'q90'], label='rf')
plt.plot(score_summary['mean_mag'], score_summary['simple', 'q90'], label='simple')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_2', 'q90'], label='PCA2')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_3', 'q90'], label='PCA3')
plt.legend(loc='upper left')
Out[19]:
<matplotlib.legend.Legend at 0x12e65d0c0>
In [20]:
fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc', 'mse'], 'o-', label='ad_hoc')
plt.plot(score_summary['mean_mag'], score_summary['rf', 'mse'], label='rf')
# plt.plot(score_summary['mean_mag'], score_summary['simple', 'mse'], label='simple')
# plt.plot(score_summary['mean_mag'], score_summary['linear_pca_2', 'mse'], label='PCA2')
plt.plot(score_summary['mean_mag'], score_summary['linear_pca_3', 'mse'], label='PCA3')
plt.legend(loc='upper left')
Out[20]:
<matplotlib.legend.Legend at 0x13037d5a0>
In [21]:
fig = plt.figure(figsize=(12, 8))
plt.plot(score_summary['mean_mag'], score_summary['ad_hoc'], 'o-', label='ad_hoc')
plt.legend(loc='upper left')
Out[21]:
<matplotlib.legend.Legend at 0x13049a9e0>
Fitting One by One¶
In [22]:
def do(name, train, test):
    info = pipelines[name]
    if info['select'] is not None:
        train = info['select'](train)
    pipeline = info['pipeline']()
    args = info['args'](train)
    x_train = train[['g_mag', 'bp_mag', 'rp_mag']]
    y_train = train[['mag_aca_obs']]
    x_test = test[['g_mag', 'bp_mag', 'rp_mag']]
    y_test = test[['mag_aca_obs']]
    pipeline.fit(x_train, y_train['mag_aca_obs'], **args)
    train['y_pred'] = pipeline.predict(x_train)
    test['y_pred'] = pipeline.predict(x_test)
    train['y_res'] = train['y_pred'] - y_train['mag_aca_obs']
    test['y_res'] = test['y_pred'] - y_test['mag_aca_obs']
    
    return pipeline, train, test
def binned_residuals(train):
    residuals = train.groupby('mag_bin')[['ave_gaia_mag', 'y_res']].mean()
    residuals['d_y_res'] = train.groupby('mag_bin')[['y_res']].std()
    residuals['n'] = train.groupby('mag_bin')['y_res'].count()
    residuals = residuals[residuals['n'] > 1]
    return residuals
In [23]:
i = 10
x_val = (data['x_val_sample'] >= i) & (data['x_val_sample'] <= i+1)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline = do('ad_hoc', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
fig, axes = plot_res_v_mag(residuals, t_residuals)
# axes[0].set_ylim((-0.1, 0.1))
In [24]:
i = 10
x_val = (data['x_val_sample'] >= i) & (data['x_val_sample'] <= i+1)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline = do('ad_hoc_2', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
fig, axes = plot_res_v_mag(residuals, t_residuals)
# axes[0].set_ylim((-0.1, 0.1))
In [26]:
train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline = do('ad_hoc_w', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
fig, axes = plot_res_v_mag(residuals, t_residuals)
axes[0].set_ylim((-0.1, 0.1))
Out[26]:
(-0.1, 0.1)
In [27]:
train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline = do('simple', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)
Out[27]:
(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))
In [28]:
train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline = do('rf', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(test)
plot_res_v_mag(residuals, t_residuals)
Out[28]:
(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))
In [29]:
train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline, train, test = do('simple_10_3', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)
Out[29]:
(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))
In [30]:
i = 6
x_val = (data['x_val_sample'] >= i) & (data['x_val_sample'] <= i+1)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline, train, test = do('simple_9', train, test)
x_train = train[['g_mag', 'bp_mag', 'rp_mag']]
y_train = train[['mag_aca_obs']]
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
fig, axes = plot_res_v_mag(residuals, t_residuals)
def f(x):
    a = 1/2 + np.arctan((x - 9.4) / 0.45)/np.pi
    s = 0.07 + 0.1 * (x - 10)
    return 0.012 + a * s
# plt.ylim((-0.1, 0.2))
x = np.linspace(5, 12, 71)
plt.sca(axes[0])
plt.plot(x, f(x))
Out[30]:
[<matplotlib.lines.Line2D at 0x1307aecb0>]
In [31]:
x_val = (data['x_val_sample'] >= 6) & (data['x_val_sample'] <= 7)
train = data[~x_val].reset_index()
test = data[x_val].reset_index()
pipeline, train, test = do('simple_n2', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)
Out[31]:
(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))
In [32]:
train = data[(data['x_val_sample'] > 1)].reset_index()
test = data[(data['x_val_sample'] <= 1)].reset_index()
pipeline, train, test = do('simple_log', train, test)
residuals = binned_residuals(train)
t_residuals = binned_residuals(test)
# plot_res_scatter(train, test)
# plot_res_1d(train, test)
# plot_res_2d(train)
plot_res_v_mag(residuals, t_residuals)
Out[32]:
(<Figure size 1200x1000 with 2 Axes>,
 array([<AxesSubplot: ylabel='residuals'>,
        <AxesSubplot: xlabel='mean Gaia mag', ylabel='pull'>], dtype=object))
In [ ]: