
Merge branch 'geekan:main' into feat-opt_docs
HuiDBK authored Oct 31, 2024
2 parents 900efa4 + 3016199 commit 2c00034
Showing 19 changed files with 2,908 additions and 8 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,4 +15,5 @@ src/*/blog
 src/*/rfcs
 src/*/DataInterpreter
 src/*/demos
+src/*/sela
 src/utils/diff.ts
34 changes: 27 additions & 7 deletions .vitepress/config.mts
@@ -56,7 +56,7 @@ const genRfcLinks = (dir: string, prefixPath = '') => {
 };
 
 const rfcLinks = genRfcLinks(resolve(__dirname, '../src/rfcs'));
-const sources = ['blog', 'rfcs', 'DataInterpreter'];
+const sources = ['blog', 'rfcs', 'DataInterpreter', 'sela'];
 const dests = ['zh', 'en'];
 
 const copyDir = (source: string, dest: string) => {
@@ -181,9 +181,19 @@ export default defineConfig({
       activeMatch: '/en/guide/',
     },
     {
-      text: 'Data Interpreter',
-      link: '/en/DataInterpreter/index',
-      activeMatch: '/en/DataInterpreter/',
+      text: 'Demo',
+      items: [
+        {
+          text: 'Data Interpreter',
+          link: '/en/DataInterpreter/index',
+          activeMatch: '/en/DataInterpreter/',
+        },
+        // {
+        //   text: 'SELA',
+        //   link: '/en/sale/index',
+        //   activeMatch: '/en/sale/',
+        // },
+      ],
     },
     ...arrVisible(
       [
@@ -453,9 +463,19 @@ export default defineConfig({
       activeMatch: '/zh/guide/',
     },
     {
-      text: 'Data Interpreter',
-      link: '/zh/DataInterpreter/index',
-      activeMatch: '/zh/DataInterpreter/',
+      text: '样例',
+      items: [
+        {
+          text: 'Data Interpreter',
+          link: '/zh/DataInterpreter/index',
+          activeMatch: '/zh/DataInterpreter/',
+        },
+        // {
+        //   text: 'SELA',
+        //   link: '/zh/sale/index',
+        //   activeMatch: '/zh/sale/',
+        // },
+      ],
     },
     ...arrVisible(
       [
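
For context on what the 'sela' source entry does: the copyDir step visible above presumably mirrors src/sela into each locale tree (src/zh/sela and src/en/sela) when the config loads, which is also why src/*/sela was added to .gitignore. A minimal sketch of that copy loop, assuming Node's fs.cpSync and the sources/dests arrays from this diff:

import { cpSync } from 'node:fs';
import { resolve } from 'node:path';

const sources = ['blog', 'rfcs', 'DataInterpreter', 'sela'];
const dests = ['zh', 'en'];

// Mirror each shared source directory into every locale tree,
// e.g. src/sela -> src/zh/sela and src/en/sela (assumed layout).
for (const source of sources) {
  for (const dest of dests) {
    cpSync(
      resolve(__dirname, `../src/${source}`),
      resolve(__dirname, `../src/${dest}/${source}`),
      { recursive: true }, // copy nested files, overwriting stale copies
    );
  }
}
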
1 change: 1 addition & 0 deletions package.json
@@ -23,6 +23,7 @@
     "@vueuse/core": "^10.6.1",
     "dayjs": "^1.11.10",
     "execa": "^8.0.1",
+    "highlight.js": "^11.10.0",
     "lint-staged": "^15.0.2",
     "prettier": "^3.0.3",
     "sass": "^1.71.1",
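
highlight.js is the only new runtime dependency; it is presumably here to syntax-highlight the Python code strings carried in the demo2 tree data below. A minimal sketch of the highlight.js v11 API (the snippet value is illustrative, and a theme stylesheet such as highlight.js/styles/github.css supplies the colors):

import hljs from 'highlight.js';

// Render one node's `code` string as highlighted HTML.
const snippet = 'import pandas as pd\nprint(pd.__version__)';
const { value: html } = hljs.highlight(snippet, { language: 'python' });
// `html` now contains <span class="hljs-...">...</span> markup.
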
8 changes: 8 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default.

262 changes: 262 additions & 0 deletions src/components/demo2/datas/credit-g/tree.json

Large diffs are not rendered by default.

24 changes: 24 additions & 0 deletions src/components/demo2/datas/credit-g/tree_01.json
@@ -0,0 +1,24 @@
[
{
"id": "0",
"parent_id": null,
"avg_score": "73.7",
"dev_score": "83.4",
"visits": 10,
"order": 1,
"instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
"code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
"active": true
},
{
"id": "0-4",
"parent_id": "0",
"avg_score": "0.0",
"dev_score": "0.0",
"visits": 0,
"order": "1.5",
"instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
"code": "",
"active": false
}
]
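
Each tree_*.json file is a flat array of search-tree nodes linked through parent_id: scores arrive serialized as strings, order marks where a node sits in the plan, and code stays empty until a node's step has run. A sketch, in TypeScript, of how the demo component might type and index these nodes; the field semantics are inferred from the JSON above, not from documented types:

// Shape of one search-tree node as stored in tree_01.json / tree_02.json.
interface TreeNode {
  id: string;               // dotted path such as "0-3-1"
  parent_id: string | null; // null only for the root
  avg_score: string;        // scores are serialized as strings
  dev_score: string;
  visits: number;
  order: number | string;   // integer once expanded, "1.5"-style while pending
  instruction: string;
  code: string;             // empty until the node's step has been executed
  active: boolean;          // marks the currently selected path
}

// Index the flat array by parent_id so it can be drawn as a tree.
function childrenOf(nodes: TreeNode[]): Map<string | null, TreeNode[]> {
  const byParent = new Map<string | null, TreeNode[]>();
  for (const node of nodes) {
    const siblings = byParent.get(node.parent_id) ?? [];
    siblings.push(node);
    byParent.set(node.parent_id, siblings);
  }
  return byParent;
}
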
79 changes: 79 additions & 0 deletions src/components/demo2/datas/credit-g/tree_02.json
@@ -0,0 +1,79 @@
[
{
"id": "0",
"parent_id": null,
"avg_score": "73.7",
"dev_score": "83.4",
"visits": 10,
"order": 1,
"instruction": "Perform exploratory data analysis on the train and dev sets to understand the dataset structure and characteristics.\n\nPreprocess the train, dev, and test sets by handling missing values, encoding categorical variables, and scaling numerical features.",
"code": "import pandas as pd\nimport numpy as np\n\n# Load the dataset\ntrain_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_train.csv')\ndev_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_dev.csv')\n\n# Display basic information about the datasets\nprint(\"Train Dataset Info:\")\nprint(train_df.info())\nprint(\"\\nDev Dataset Info:\")\nprint(dev_df.info())\n\n# Display summary statistics for numerical columns\nprint(\"\\nTrain Dataset Summary Statistics:\")\nprint(train_df.describe())\nprint(\"\\nDev Dataset Summary Statistics:\")\nprint(dev_df.describe())\n\n# Display the number of unique values for categorical columns\ncategorical_columns = train_df.select_dtypes(include=['object', 'category']).columns\nprint(\"\\nUnique Values in Categorical Columns:\")\nfor col in categorical_columns:\n print(f\"{col}: {train_df[col].nunique()} unique values\")\n\n# Check for missing values\nprint(\"\\nMissing Values in Train Dataset:\")\nprint(train_df.isnull().sum())\nprint(\"\\nMissing Values in Dev Dataset:\")\nprint(dev_df.isnull().sum())\n\n# Check the distribution of the target column\nprint(\"\\nTarget Column Distribution in Train Dataset:\")\nprint(train_df['class'].value_counts(normalize=True))\nprint(\"\\nTarget Column Distribution in Dev Dataset:\")\nprint(dev_df['class'].value_counts(normalize=True))\n\n\nfrom sklearn.preprocessing import StandardScaler\n\n# Function to preprocess data\ndef preprocess_data(df, scaler=None):\n df_copy = df.copy()\n \n # Separate target column if it exists\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Scale numerical features\n if scaler is None:\n scaler = StandardScaler()\n df_copy[df_copy.columns] = scaler.fit_transform(df_copy)\n else:\n df_copy[df_copy.columns] = scaler.transform(df_copy)\n \n # Reattach target column if it was separated\n if y is not None:\n df_copy['class'] = y\n \n return df_copy, scaler\n\n# Preprocess train, dev, and test sets\ntrain_df_processed, scaler = preprocess_data(train_df)\ndev_df_processed, _ = preprocess_data(dev_df, scaler)\ntest_df = pd.read_csv('/data/chiyizhou/datasets/jasmine/split_test_wo_target.csv')\ntest_df_processed, _ = preprocess_data(test_df, scaler)\n\n# Display the first few rows of the processed train dataset\nprint(\"Processed Train Dataset:\")\nprint(train_df_processed.head())\n",
"active": true
},
{
"id": "0-3",
"parent_id": "0",
"avg_score": "81.6",
"dev_score": "81.0",
"visits": 4,
"order": 2,
"instruction": "Analyze the frequency distribution of categorical features to understand their diversity and potential impact on the model. Use this analysis to engineer new features or transform existing ones that can better capture the underlying patterns in the data.",
"code": "from sklearn.preprocessing import PolynomialFeatures\n\ndef engineer_features(df, poly=None):\n df_copy = df.copy()\n if 'class' in df_copy.columns:\n y = df_copy.pop('class')\n else:\n y = None\n \n # Remove ID columns if any\n df_copy = df_copy.select_dtypes(include=['number'])\n \n # Polynomial features\n if poly is None:\n poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)\n poly_features = poly.fit_transform(df_copy)\n else:\n poly_features = poly.transform(df_copy)\n \n poly_columns = poly.get_feature_names_out(df_copy.columns)\n df_poly = pd.DataFrame(poly_features, columns=poly_columns)\n \n if y is not None:\n df_poly['class'] = y\n \n return df_poly, poly\n\ntrain_df_poly, poly = engineer_features(train_df_processed)\ndev_df_poly, _ = engineer_features(dev_df_processed, poly)\ntest_df_poly, _ = engineer_features(test_df_processed, poly)\n\nprint(\"Processed Train Dataset with Polynomial Features:\")\nprint(train_df_poly.head())\n",
"active": true
},
{
"id": "0-3-0",
"parent_id": "0-3",
"avg_score": "0.0",
"dev_score": "0.0",
"visits": 0,
"order": "2.5",
"instruction": "Train a base model to predict the target column 'class' on the train set after performing a correlation analysis to identify highly correlated features and address potential multicollinearity issues.",
"code": "",
"active": false
},
{
"id": "0-3-1",
"parent_id": "0-3",
"avg_score": "0.0",
"dev_score": "0.0",
"visits": 0,
"order": "2.5",
"instruction": "Train a base model to predict the target column 'class' on the train set. First, analyze the distribution of the target variable to check for class imbalance. If class imbalance is detected, apply appropriate sampling techniques such as oversampling the minority class or undersampling the majority class to balance the dataset before training the model.",
"code": "",
"active": false
},
{
"id": "0-3-2",
"parent_id": "0-3",
"avg_score": "0.0",
"dev_score": "0.0",
"visits": 0,
"order": "2.5",
"instruction": "Train a base model to predict the target column 'class' on the train set after visualizing the distribution of numerical features to identify and handle outliers and skewed distributions.",
"code": "",
"active": false
},
{
"id": "0-3-4",
"parent_id": "0-3",
"avg_score": "0.0",
"dev_score": "0.0",
"visits": 0,
"order": "2.5",
"instruction": "Train a base model to predict the target column 'class' on the train set after performing PCA (Principal Component Analysis) to reduce dimensionality and understand the variance captured by different components.",
"code": "",
"active": false
},
{
"id": "0-4",
"parent_id": "0",
"avg_score": "0.0",
"dev_score": "0.0",
"visits": 0,
"order": "1.5",
"instruction": "Perform Principal Component Analysis (PCA) to reduce dimensionality and understand the variance captured by different components, then engineer features based on the most significant principal components to improve model performance.",
"code": "",
"active": false
}
]
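
Relative to tree_01.json, this file expands node 0-3 with four unvisited children (0-3-0 through 0-3-4), so the numbered files appear to replay the search tree growing step by step. Reusing TreeNode and childrenOf from the sketch above, the selected experiment path could be recovered by following the active flags from the root; this reading of the flag is an assumption from the data, not a documented contract:

// Walk the `active` flags from the root to the deepest active node.
function activePath(nodes: TreeNode[]): TreeNode[] {
  const byParent = childrenOf(nodes);
  const path: TreeNode[] = [];
  let current = nodes.find((n) => n.parent_id === null && n.active);
  while (current) {
    path.push(current);
    current = (byParent.get(current.id) ?? []).find((n) => n.active);
  }
  return path;
}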
