注意
跳到最後下載完整的範例程式碼。或者透過 JupyterLite 或 Binder 在您的瀏覽器中執行此範例
離散資料結構上的高斯過程#
此範例說明如何在不具備固定長度特徵向量形式的資料上,使用高斯過程進行迴歸和分類任務。這是透過使用直接在離散結構(例如變長序列、樹狀結構和圖形)上運作的核函數來實現的。
具體來說,此處的輸入變數是一些基因序列,這些基因序列以變長字串形式儲存,包含字母「A」、「T」、「C」和「G」,而輸出變數分別是迴歸任務中的浮點數和分類任務中的 True/False 標籤。
基因序列之間的核函數是透過 R-卷積 [1] 定義的,方法是將二元字母核函數整合到一對字串中所有成對的字母上。
此範例將產生三個圖形。
在第一個圖形中,我們使用色譜圖視覺化核函數的值,即序列的相似度。此處較亮的顏色表示較高的相似度。
在第二個圖形中,我們顯示在 6 個序列資料集上的一些迴歸結果。在這裡,我們使用第 1、2、4 和 5 個序列作為訓練集,以對第 3 和 6 個序列進行預測。
在第三個圖形中,我們示範一個分類模型,該模型在 6 個序列上進行訓練,並對另外 5 個序列進行預測。此處的真實情況很簡單,即序列中是否至少有一個「A」。在這裡,模型進行了四次正確的分類,並且有一次失敗。
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
import numpy as np
from sklearn.base import clone
from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor
from sklearn.gaussian_process.kernels import GenericKernelMixin, Hyperparameter, Kernel
class SequenceKernel(GenericKernelMixin, Kernel):
"""
A minimal (but valid) convolutional kernel for sequences of variable
lengths."""
def __init__(self, baseline_similarity=0.5, baseline_similarity_bounds=(1e-5, 1)):
self.baseline_similarity = baseline_similarity
self.baseline_similarity_bounds = baseline_similarity_bounds
@property
def hyperparameter_baseline_similarity(self):
return Hyperparameter(
"baseline_similarity", "numeric", self.baseline_similarity_bounds
)
def _f(self, s1, s2):
"""
kernel value between a pair of sequences
"""
return sum(
[1.0 if c1 == c2 else self.baseline_similarity for c1 in s1 for c2 in s2]
)
def _g(self, s1, s2):
"""
kernel derivative between a pair of sequences
"""
return sum([0.0 if c1 == c2 else 1.0 for c1 in s1 for c2 in s2])
def __call__(self, X, Y=None, eval_gradient=False):
if Y is None:
Y = X
if eval_gradient:
return (
np.array([[self._f(x, y) for y in Y] for x in X]),
np.array([[[self._g(x, y)] for y in Y] for x in X]),
)
else:
return np.array([[self._f(x, y) for y in Y] for x in X])
def diag(self, X):
return np.array([self._f(x, x) for x in X])
def is_stationary(self):
return False
def clone_with_theta(self, theta):
cloned = clone(self)
cloned.theta = theta
return cloned
kernel = SequenceKernel()
核函數下的序列相似度矩陣#
import matplotlib.pyplot as plt
X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"])
K = kernel(X)
D = kernel.diag(X)
plt.figure(figsize=(8, 5))
plt.imshow(np.diag(D**-0.5).dot(K).dot(np.diag(D**-0.5)))
plt.xticks(np.arange(len(X)), X)
plt.yticks(np.arange(len(X)), X)
plt.title("Sequence similarity under the kernel")
plt.show()

迴歸#
X = np.array(["AGCT", "AGC", "AACT", "TAA", "AAA", "GAACA"])
Y = np.array([1.0, 1.0, 2.0, 2.0, 3.0, 3.0])
training_idx = [0, 1, 3, 4]
gp = GaussianProcessRegressor(kernel=kernel)
gp.fit(X[training_idx], Y[training_idx])
plt.figure(figsize=(8, 5))
plt.bar(np.arange(len(X)), gp.predict(X), color="b", label="prediction")
plt.bar(training_idx, Y[training_idx], width=0.2, color="r", alpha=1, label="training")
plt.xticks(np.arange(len(X)), X)
plt.title("Regression on sequences")
plt.legend()
plt.show()

分類#
X_train = np.array(["AGCT", "CGA", "TAAC", "TCG", "CTTT", "TGCT"])
# whether there are 'A's in the sequence
Y_train = np.array([True, True, True, False, False, False])
gp = GaussianProcessClassifier(kernel)
gp.fit(X_train, Y_train)
X_test = ["AAA", "ATAG", "CTC", "CT", "C"]
Y_test = [True, True, False, False, False]
plt.figure(figsize=(8, 5))
plt.scatter(
np.arange(len(X_train)),
[1.0 if c else -1.0 for c in Y_train],
s=100,
marker="o",
edgecolor="none",
facecolor=(1, 0.75, 0),
label="training",
)
plt.scatter(
len(X_train) + np.arange(len(X_test)),
[1.0 if c else -1.0 for c in Y_test],
s=100,
marker="o",
edgecolor="none",
facecolor="r",
label="truth",
)
plt.scatter(
len(X_train) + np.arange(len(X_test)),
[1.0 if c else -1.0 for c in gp.predict(X_test)],
s=100,
marker="x",
facecolor="b",
linewidth=2,
label="prediction",
)
plt.xticks(np.arange(len(X_train) + len(X_test)), np.concatenate((X_train, X_test)))
plt.yticks([-1, 1], [False, True])
plt.title("Classification on sequences")
plt.legend()
plt.show()

/home/circleci/project/sklearn/gaussian_process/kernels.py:442: ConvergenceWarning:
The optimal value found for dimension 0 of parameter baseline_similarity is close to the specified lower bound 1e-05. Decreasing the bound and calling fit again may find a better value.
腳本的總執行時間:(0 分鐘 0.239 秒)
相關範例