Open t-SNE | Notion

Untitled

Fashion-MNIST 데이터 10,000개 추출 후, PCA로 30차원까지 줄였음.

perplexity를 바꿔가며 수행
n_jobs를 바꿔가며 수행
exaggeration을 바꿔가며 수행
affinity를 바꿔가며 수행
dof를 바꿔가며 수행

데이터를 1/3씩 추가

Untitled

from tensorflow.keras.datasets import fashion_mnist
import matplotlib.pyplot as plt
import openTSNE
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

def plot_tsne_embedding(data, labels, perplexity=30, n_jobs=8, random_state=42, exaggeration=1.5, title_suffix=""):
    """Embed ``data`` in 2-D with openTSNE, show a scatter plot, and return the embedding.

    The function builds perplexity-based nearest-neighbour affinities with a
    Euclidean metric, starts from a random initialization, runs a single
    250-iteration optimization phase with the given ``exaggeration``, and
    then displays the result with matplotlib.

    Args:
        data: Samples to embed; passed unchanged to
            ``openTSNE.affinity.PerplexityBasedNN`` and
            ``openTSNE.initialization.random``.
            (assumes a 2-D array of samples x features -- TODO confirm)
        labels: Per-sample values used to colour the scatter points.
        perplexity: Perplexity used when building the affinities.
        n_jobs: Worker count forwarded to both the affinity computation and
            the embedding object.
        random_state: Seed forwarded to the affinity computation and to the
            random initialization.
        exaggeration: Exaggeration factor forwarded to ``optimize``.
        title_suffix: Optional text appended to the plot title after
            ``' - '``. When empty, no separator is added.

    Returns:
        The optimized ``openTSNE.TSNEEmbedding``.

    Side effects:
        Prints the parameters that differ from their defaults and the initial
        coordinates, and opens a matplotlib figure via ``plt.show()``.
    """
    import inspect

    # Values actually used for this run; only these four are reported.
    params_used = {
        'perplexity': perplexity,
        'n_jobs': n_jobs,
        'random_state': random_state,
        'exaggeration': exaggeration
    }

    # Read the defaults from the signature itself so that the report cannot
    # drift out of sync if a default in the ``def`` line is edited later.
    signature_params = inspect.signature(plot_tsne_embedding).parameters
    changed_params = {
        name: value
        for name, value in params_used.items()
        if value != signature_params[name].default
    }
    print(f"Changed parameters: {changed_params}")

    # Build the affinities.
    affinities = openTSNE.affinity.PerplexityBasedNN(
        data,
        perplexity=perplexity,
        metric="euclidean",
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=True,
    )
    # Build the initialization.
    initialization = openTSNE.initialization.random(data, random_state=random_state)

    # Show the starting coordinates before any optimization.
    print("Initial coordinates: ", initialization)

    # Build the embedding object and optimize it in place.
    embedding = openTSNE.TSNEEmbedding(
        initialization,
        affinities,
        negative_gradient_method="fft",
        n_jobs=n_jobs,
        verbose=True
    )
    embedding.optimize(250, exaggeration=exaggeration, inplace=True)

    # Plot title: only add the ' - ' separator when there is a suffix, so the
    # default call does not produce a dangling "t-SNE Embedding - ".
    title = 't-SNE Embedding'
    if title_suffix:
        title += ' - ' + title_suffix
    if changed_params:
        title += ' (' + ', '.join(f"{k}={v}" for k, v in changed_params.items()) + ')'

    # Plotting.
    plt.figure(figsize=(8, 6))
    plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap='viridis', s=1.5)  # keep the markers small
    plt.colorbar(label='Class Label')
    plt.title(title)
    plt.xlabel('t-SNE Feature 1')
    plt.ylabel('t-SNE Feature 2')
    plt.show()
    return embedding

def _show_embedding(points, point_labels, title):
    """Scatter-plot a 2-D embedding coloured by ``point_labels`` and display it."""
    plt.figure(figsize=(8, 6))
    plt.scatter(points[:, 0], points[:, 1], c=point_labels, cmap='viridis', s=1.5)  # keep the markers small
    plt.colorbar(label='Class Label')
    plt.title(title)
    plt.xlabel('t-SNE Feature 1')
    plt.ylabel('t-SNE Feature 2')
    plt.show()


# Load the Fashion-MNIST dataset.
(x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

# Merge the train and test splits into one pool, then flatten every image
# into a 1-D feature vector.
x_all = np.concatenate([x_train, x_test], axis=0)
y_all = np.concatenate([y_train, y_test], axis=0)
x_all = x_all.reshape(len(x_all), -1)

# Draw a stratified random sample of 10,000 items; the remainder is discarded.
x_sam, _, y_sample, _ = train_test_split(
    x_all,
    y_all,
    train_size=10000,
    stratify=y_all,
    random_state=42,
)

# Reduce the sampled features to 30 principal components.
pca = PCA(n_components=30, random_state=42)
x_sample = pca.fit_transform(x_sam)

# Cumulative prefixes of the sample: the first third, the first two thirds
# (which therefore contains the first third), and the whole sample.
third_index = x_sample.shape[0] // 3
data_part1, labels_part1 = x_sample[:third_index], y_sample[:third_index]
data_part2, labels_part2 = x_sample[:2 * third_index], y_sample[:2 * third_index]
data_part3, labels_part3 = x_sample, y_sample

# Fit and plot a full embedding on the first third only.
embedding_1 = plot_tsne_embedding(data_part1, labels_part1, title_suffix="1/3 of the Data Set")

# Map the first two thirds onto the fitted embedding via transform(), then plot.
embedding_2 = embedding_1.transform(data_part2)
_show_embedding(embedding_2, labels_part2, "2/3 of the Data Set")

# Map the whole sample onto the same fitted embedding via transform(), then plot.
embedding_3 = embedding_1.transform(data_part3)
_show_embedding(embedding_3, labels_part3, "Whole Data set")

Dataset: Digits

Transformation 미사용: 여전히 global structure가 유지되지 않음.

Untitled

Transformation 사용

Untitled

Time elapsed: 27.08 seconds

Time elapsed: 0.25 seconds

Time elapsed: 0.31 seconds