Merge branch 'geekan:main' into feat-opt_docs

geekan · Oct 31, 2024 · 2c00034 · 2c00034
2 parents 900efa4 + 3016199
commit 2c00034
Show file tree

Hide file tree

Showing 19 changed files with 2,908 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -15,4 +15,5 @@ src/*/blog
 src/*/rfcs
 src/*/DataInterpreter
 src/*/demos
+src/*/sela
 src/utils/diff.ts
diff --git a/.vitepress/config.mts b/.vitepress/config.mts
@@ -56,7 +56,7 @@ const genRfcLinks = (dir: string, prefixPath = '') => {
 };
 
 const rfcLinks = genRfcLinks(resolve(__dirname, '../src/rfcs'));
-const sources = ['blog', 'rfcs', 'DataInterpreter'];
+const sources = ['blog', 'rfcs', 'DataInterpreter', 'sela'];
 const dests = ['zh', 'en'];
 
 const copyDir = (source: string, dest: string) => {
@@ -181,9 +181,19 @@ export default defineConfig({
             activeMatch: '/en/guide/',
           },
           {
-            text: 'Data Interpreter',
-            link: '/en/DataInterpreter/index',
-            activeMatch: '/en/DataInterpreter/',
+            text: 'Demo',
+            items: [
+              {
+                text: 'Data Interpreter',
+                link: '/en/DataInterpreter/index',
+                activeMatch: '/en/DataInterpreter/',
+              },
+              // {
+              //   text: 'SELA',
+              //   link: '/en/sale/index',
+              //   activeMatch: '/en/sale/',
+              // },
+            ],
           },
           ...arrVisible(
             [
@@ -453,9 +463,19 @@ export default defineConfig({
             activeMatch: '/zh/guide/',
           },
           {
-            text: 'Data Interpreter',
-            link: '/zh/DataInterpreter/index',
-            activeMatch: '/zh/DataInterpreter/',
+            text: '样例',
+            items: [
+              {
+                text: 'Data Interpreter',
+                link: '/zh/DataInterpreter/index',
+                activeMatch: '/zh/DataInterpreter/',
+              },
+              // {
+              //   text: 'SELA',
+              //   link: '/zh/sale/index',
+              //   activeMatch: '/zh/sale/',
+              // },
+            ],
           },
           ...arrVisible(
             [

diff --git a/package.json b/package.json
@@ -23,6 +23,7 @@
     "@vueuse/core": "^10.6.1",
     "dayjs": "^1.11.10",
     "execa": "^8.0.1",
+    "highlight.js": "^11.10.0",
     "lint-staged": "^15.0.2",
     "prettier": "^3.0.3",
     "sass": "^1.71.1",

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/src/components/demo2/datas/credit-g/tree.json b/src/components/demo2/datas/credit-g/tree.json
diff --git a/src/components/demo2/datas/credit-g/tree_01.json b/src/components/demo2/datas/credit-g/tree_01.json
@@ -0,0 +1,24 @@
+[
+    {
+        "id": "0",
+        "parent_id": null,
+        "avg_score": "73.7",
+        "dev_score": "83.4",
+        "visits": 10,
+        "order": 1,
+        "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+        "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n    print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n    df_copy = df.copy()\n    \n    # Separate target column if it exists\n    if 'class' in df_copy.columns:\n        y = df_copy.pop('class')\n    else:\n        y = None\n    \n    # Scale numerical features\n    if scaler is None:\n        scaler = StandardScaler()\n        df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n    else:\n        df_copy[df_copy.columns] = scaler.transform(df_copy)\n    \n    # Reattach target column if it was separated\n    if y is not None:\n        df_copy['class'] = y\n    \n    return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+        "active": true
+    },
+    {
+        "id": "0-4",
+        "parent_id": "0",
+        "avg_score": "0.0",
+        "dev_score": "0.0",
+        "visits": 0,
+        "order": "1.5",
+        "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+        "code": "",
+        "active": false
+    }
+]
diff --git a/src/components/demo2/datas/credit-g/tree_02.json b/src/components/demo2/datas/credit-g/tree_02.json
@@ -0,0 +1,79 @@
+[
+    {
+        "id": "0",
+        "parent_id": null,
+        "avg_score": "73.7",
+        "dev_score": "83.4",
+        "visits": 10,
+        "order": 1,
+        "instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
+        "code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n    print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n    df_copy = df.copy()\n    \n    # Separate target column if it exists\n    if 'class' in df_copy.columns:\n        y = df_copy.pop('class')\n    else:\n        y = None\n    \n    # Scale numerical features\n    if scaler is None:\n        scaler = StandardScaler()\n        df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n    else:\n        df_copy[df_copy.columns] = scaler.transform(df_copy)\n    \n    # Reattach target column if it was separated\n    if y is not None:\n        df_copy['class'] = y\n    \n    return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
+        "active": true
+    },
+    {
+        "id": "0-3",
+        "parent_id": "0",
+        "avg_score": "81.6",
+        "dev_score": "81.0",
+        "visits": 4,
+        "order": 2,
+        "instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
+        "code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n    df_copy = df.copy()\n    if 'class' in df_copy.columns:\n        y = df_copy.pop('class')\n    else:\n        y = None\n    \n    # Remove ID columns if any\n    df_copy = df_copy.select_dtypes(include=['number'])\n    \n    # Polynomial features\n    if poly is None:\n        poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n        poly_features = poly.fit_transform(df_copy)\n    else:\n        poly_features = poly.transform(df_copy)\n    \n    poly_columns = poly.get_feature_names_out(df_copy.columns)\n    df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n    \n    if y is not None:\n        df_poly['class'] = y\n    \n    return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
+        "active": true
+    },
+    {
+        "id": "0-3-0",
+        "parent_id": "0-3",
+        "avg_score": "0.0",
+        "dev_score": "0.0",
+        "visits": 0,
+        "order": "2.5",
+        "instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
+        "code": "",
+        "active": false
+    },
+    {
+        "id": "0-3-1",
+        "parent_id": "0-3",
+        "avg_score": "0.0",
+        "dev_score": "0.0",
+        "visits": 0,
+        "order": "2.5",
+        "instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
+        "code": "",
+        "active": false
+    },
+    {
+        "id": "0-3-2",
+        "parent_id": "0-3",
+        "avg_score": "0.0",
+        "dev_score": "0.0",
+        "visits": 0,
+        "order": "2.5",
+        "instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
+        "code": "",
+        "active": false
+    },
+    {
+        "id": "0-3-4",
+        "parent_id": "0-3",
+        "avg_score": "0.0",
+        "dev_score": "0.0",
+        "visits": 0,
+        "order": "2.5",
+        "instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
+        "code": "",
+        "active": false
+    },
+    {
+        "id": "0-4",
+        "parent_id": "0",
+        "avg_score": "0.0",
+        "dev_score": "0.0",
+        "visits": 0,
+        "order": "1.5",
+        "instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
+        "code": "",
+        "active": false
+    }
+]