Likelihood ratio test python

Содержание

rnowling / likelihood_ratio_test.py
Как выполнить тест отношения правдоподобия в Python
Шаг 1: Загрузите данные
Шаг 2: Подгонка моделей регрессии
Шаг 3. Выполните тест логарифмического правдоподобия
Дополнительные ресурсы

rnowling / likelihood_ratio_test.py

This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters

«»»

Licensed under the Apache License, Version 2.0 (the «License»);

you may not use this file except in compliance with the License.

You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an «AS IS» BASIS,

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and

limitations under the License.

«»»

import numpy as np

from scipy . stats import chi2

from sklearn . linear_model import SGDClassifier

from sklearn . metrics import log_loss

def likelihood_ratio_test ( features_alternate , labels , lr_model , features_null = None ):

«»»

Compute the likelihood ratio test for a model trained on the set of features in

`features_alternate` vs a null model. If `features_null` is not defined, then

the null model simply uses the intercept (class probabilities). Note that

`features_null` must be a subset of `features_alternative` — it can not contain

features that are not in `features_alternate`.

Returns the p-value, which can be used to accept or reject the null hypothesis.

«»»

labels = np . array ( labels )

features_alternate = np . array ( features_alternate )

if features_null :

features_null = np . array ( features_null )

if features_null . shape [ 1 ] >= features_alternate . shape [ 1 ]:

raise ValueError , «Alternate features must have more features than null features»

lr_model . fit ( features_null , labels )

null_prob = lr_model . predict_proba ( features_null )[:, 1 ]

df = features_alternate . shape [ 1 ] — features_null . shape [ 1 ]

else :

null_prob = sum ( labels ) / float ( labels . shape [ 0 ]) * \

np . ones ( labels . shape )

df = features_alternate . shape [ 1 ]

lr_model . fit ( features_alternate , labels )

alt_prob = lr_model . predict_proba ( features_alternate )

alt_log_likelihood = — log_loss ( labels ,

alt_prob ,

normalize = False )

null_log_likelihood = — log_loss ( labels ,

null_prob ,

normalize = False )

G = 2 * ( alt_log_likelihood — null_log_likelihood )

p_value = chi2 . sf ( G , df )

return p_value

Читайте также: Security in php mysql

«»»

Licensed under the Apache License, Version 2.0 (the «License»);

you may not use this file except in compliance with the License.

You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software

distributed under the License is distributed on an «AS IS» BASIS,

WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

See the License for the specific language governing permissions and

limitations under the License.

«»»

import random

import matplotlib . pyplot as plt

import numpy as np

from scipy . stats import chi2

from sklearn . linear_model import SGDClassifier

from sklearn . metrics import log_loss

N_SAMPLES = 100

N_SIMS = 100

CORR_PROBS = [ 1.0 , — 1.0 , 0.95 , — 0.95 , 0.9 , — 0.9 , 0.85 , — 0.85 , 0.8 , — 0.8 , 0.75 , — 0.75 , 0.7 , — 0.7 , 0.65 , — 0.65 , 0.6 , — 0.6 , 0.5 , — 0.5 ]

def generate_binary_data ( n_samples , corr_probs ):

«»»

Generate labels and binary features for data from two classes. The

probabilities given in `corr_probs` determine the probability that a

feature’s value will agree with the sample’s label. A negative

probability indicates that the feature’s value should be the inverse

of the label. For uncorrelated features, use a probability of 0.5.

Returns a vector of labels and matrix of features.

«»»

n_features = len ( corr_probs )

features = np . zeros (( n_samples , n_features ))

labels = np . zeros ( n_samples )

for r in xrange ( n_samples ):

labels [ r ] = random . randint ( 0 , 1 )

for i , p in enumerate ( corr_probs ):

inverted = p < 0.

p = np . abs ( p )

if inverted :

for r in xrange ( n_samples ):

if random . random () < p :

features [ r , i ] = 1 — labels [ r ]

else :

features [ r , i ] = labels [ r ]

else :

for r in xrange ( n_samples ):

if random . random () < p :

features [ r , i ] = labels [ r ]

else :

features [ r , i ] = 1 — labels [ r ]

return labels , features

def needed_sgd_iter ( n_samples ):

«»»

Return number of the number of SGD iterations (epochs) needed

based on the number of samples using advice from

http://scikit-learn.org/stable/modules/sgd.html#tips-on-practical-use

«»»

return max ( 20 ,

int ( np . ceil ( 10 ** 6 / n_samples )))

def likelihood_ratio_test ( features_alternate , labels , lr_model , features_null = None ):

«»»

Compute the likelihood ratio test for a model trained on the set of features in

`features_alternate` vs a null model. If `features_null` is not defined, then

the null model simply uses the intercept (class probabilities). Note that

`features_null` must be a subset of `features_alternative` — it can not contain

features that are not in `features_alternate`.

Returns the p-value, which can be used to accept or reject the null hypothesis.

«»»

labels = np . array ( labels )

features_alternate = np . array ( features_alternate )

if features_null :

features_null = np . array ( features_null )

if features_null . shape [ 1 ] >= features_alternate . shape [ 1 ]:

raise ValueError , «Alternate features must have more features than null features»

lr_model . fit ( features_null , labels )

null_prob = lr_model . predict_proba ( features_null )[:, 1 ]

df = features_alternate . shape [ 1 ] — features_null . shape [ 1 ]

else :

null_prob = sum ( labels ) / float ( labels . shape [ 0 ]) * \

np . ones ( labels . shape )

df = features_alternate . shape [ 1 ]

lr_model . fit ( features_alternate , labels )

alt_prob = lr_model . predict_proba ( features_alternate )

alt_log_likelihood = — log_loss ( labels ,

alt_prob ,

normalize = False )

null_log_likelihood = — log_loss ( labels ,

null_prob ,

normalize = False )

G = 2 * ( alt_log_likelihood — null_log_likelihood )

p_value = chi2 . sf ( G , df )

return p_value

def plot_pvalues ( flname , p_values , title ):

log_p_values = np . log10 ( p_values )

plt . clf ()

plt . boxplot ( x = log_p_values )

plt . xlabel ( «Variable» , fontsize = 16 )

plt . ylabel ( «P-Value (log10)» , fontsize = 16 )

plt . title ( title , fontsize = 18 )

plt . savefig ( flname , DPI = 200 )

if __name__ == «__main__» :

# burn in

for i in xrange ( 100 ):

random . random ()

model = SGDClassifier ( loss = «log» ,

penalty = «l2» ,

n_iter = needed_sgd_iter ( N_SAMPLES ))

print «Feature Details:»

for i in xrange ( len ( CORR_PROBS )):

inverted = CORR_PROBS [ i ] < 0.

print «Feature:» , i , «Corr Prob:» , np . abs ( CORR_PROBS [ i ]), «Inverted:» , inverted

feature_log_p_values = np . zeros (( N_SIMS , len ( CORR_PROBS )))

for j in xrange ( N_SIMS ):

labels , features = generate_binary_data ( N_SAMPLES , CORR_PROBS )

print «Trial:» , ( j + 1 )

for i in xrange ( len ( CORR_PROBS )):

# force into Nx1 matrix

column = features [:, i ]. reshape ( — 1 , 1 )

p_value = likelihood_ratio_test ( column ,

labels ,

model )

feature_log_p_values [ j , i ] = p_value

#inverted = CORR_PROBS[i] < 0.

#print «Feature:», i, «Corr Prob:», np.abs(CORR_PROBS[i]), «Inverted:», inverted, «Likelihood Ratio Test p-value:», p_value

plot_pvalues ( «p_values_boxplot.png» ,

feature_log_p_values ,

«» )

Читайте также: background-position

Источник

Как выполнить тест отношения правдоподобия в Python

Тест отношения правдоподобия сравнивает качество соответствия двух вложенных регрессионных моделей .

Вложенная модель — это просто модель, которая содержит подмножество переменных-предикторов в общей регрессионной модели.

Например, предположим, что у нас есть следующая регрессионная модель с четырьмя переменными-предикторами:

Y = β 0 + β 1 х 1 + β 2 х 2 + β 3 х 3 + β 4 х 4 + ε

Одним из примеров вложенной модели может быть следующая модель только с двумя исходными предикторными переменными:

Y = β 0 + β 1 х 1 + β 2 х 2 + ε

Чтобы определить, существенно ли различаются эти две модели, мы можем выполнить тест отношения правдоподобия, в котором используются следующие нулевая и альтернативная гипотезы:

H 0 : Полная модель и вложенная модель одинаково хорошо соответствуют данным. Таким образом, вы должны использовать вложенную модель .

H A : Полная модель соответствует данным значительно лучше, чем вложенная модель. Таким образом, вы должны использовать полную модель .

Если p-значение теста ниже определенного уровня значимости (например, 0,05), то мы можем отклонить нулевую гипотезу и сделать вывод, что полная модель предлагает значительно лучшее соответствие.

В следующем пошаговом примере показано, как выполнить тест отношения правдоподобия в Python.

Шаг 1: Загрузите данные

В этом примере мы покажем, как подобрать следующие две модели регрессии в Python, используя данные из набора данных mtcars :

Полная модель: миль на галлон = β 0 + β 1 расход + β 2 карбюратор + β 3 л.с. + β 4 цилиндра

Уменьшенная модель: mpg = β 0 + β 1 disp + β 2 carb

Сначала мы загрузим набор данных:

from sklearn. linear_model import LinearRegression import statsmodels.api as sm import pandas as pd import scipy #define URL where dataset is located url = "https://raw.githubusercontent.com/Statology/Python-Guides/main/mtcars.csv" #read in data data = pd.read_csv (url)

Шаг 2: Подгонка моделей регрессии

Во-первых, мы подгоним полную модель и рассчитаем логарифмическую вероятность модели:

#define response variable y1 = data['mpg'] #define predictor variables x1 = data[['disp', 'carb', 'hp', 'cyl']] #add constant to predictor variables x1 = sm.add_constant (x1) #fit regression model full_model = sm. OLS (y1, x1). fit () #calculate log-likelihood of model full_ll = full_model. llf print(full_ll) -77.55789711787898

Затем мы подгоним сокращенную модель и рассчитаем логарифмическую вероятность модели:

#define response variable y2 = data['mpg'] #define predictor variables x2 = data[['disp', 'carb']] #add constant to predictor variables x2 = sm.add_constant (x2) #fit regression model reduced_model = sm. OLS (y2, x2). fit () #calculate log-likelihood of model reduced_ll = reduced_model. llf print(reduced_ll) -78.60301334355185

Шаг 3. Выполните тест логарифмического правдоподобия

Далее мы будем использовать следующий код для выполнения теста логарифмического правдоподобия:

#calculate likelihood ratio Chi-Squared test statistic LR_statistic = -2 \* (reduced_ll-full_ll) print(LR_statistic) 2.0902324513457415 #calculate p-value of test statistic using 2 degrees of freedom p_val = scipy. stats.chi2.sf (LR_statistic, 2) print(p_val) 0.35165094613502257

Из вывода мы видим, что критерий хи-квадрат равен 2,0902 , а соответствующее значение p равно 0,3517 .

Поскольку это p-значение не меньше 0,05, мы не сможем отвергнуть нулевую гипотезу.

Это означает, что полная модель и вложенная модель одинаково хорошо соответствуют данным. Таким образом, мы должны использовать вложенную модель, потому что дополнительные переменные-предикторы в полной модели не обеспечивают значительного улучшения соответствия.

Таким образом, наша окончательная модель будет:

миль на галлон = β 0 + β 1 расход + β 2 углеводов

Примечание.Мы использовали 2 степени свободы при расчете p-значения, потому что это представляло разницу между общими предикторными переменными, используемыми между двумя моделями.

Дополнительные ресурсы

Следующие руководства содержат дополнительную информацию о том, как использовать модели регрессии в Python:

Источник