Skip to content

Commit

Permalink
keep for comparison
Browse files Browse the repository at this point in the history
Signed-off-by: Maroun Touma <touma@us.ibm.com>
  • Loading branch information
touma-I committed Nov 18, 2024
1 parent 3481667 commit 890a9da
Showing 1 changed file with 25 additions and 60 deletions.
85 changes: 25 additions & 60 deletions transforms/language/html2parquet/notebooks/html2parquet-V0.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,26 @@
"execution_count": 2,
"id": "20663a67-5aa1-4b61-b989-94201613e41f",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"11:47:31 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n",
"11:47:31 INFO - pipeline id pipeline_id\n",
"11:47:31 INFO - code location None\n",
"11:47:31 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n",
"11:47:31 INFO - data factory data_ max_files -1, n_sample -1\n",
"11:47:31 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n",
"11:47:31 INFO - orchestrator html2parquet started at 2024-11-18 11:47:31\n",
"11:47:31 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n",
"11:47:31 INFO - Completed 1 files (100.0%) in 0.005 min\n",
"11:47:31 INFO - Done processing 1 files, waiting for flush() completion.\n",
"11:47:31 INFO - done flushing in 0.0 sec\n",
"11:47:31 INFO - Completed execution in 0.005 min, execution result 0\n"
]
}
],
"source": [
"from data_processing.runtime.pure_python import PythonTransformLauncher\n",
"from data_processing.utils import ParamsUtils\n",
Expand Down Expand Up @@ -48,44 +67,6 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "e75f6922-eb0f-4164-a536-f96393e04604",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 4,
"id": "4d2354db-1bb3-4a71-98df-f0f148af3a02",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"11:36:04 INFO - html2parquet parameters are : {'output_format': <html2parquet_output_format.MARKDOWN: 'markdown'>, 'favor_precision': <html2parquet_favor_precision.TRUE: 'True'>, 'favor_recall': <html2parquet_favor_recall.TRUE: 'True'>}\n",
"11:36:04 INFO - pipeline id pipeline_id\n",
"11:36:04 INFO - code location None\n",
"11:36:04 INFO - data factory data_ is using local data access: input_folder - input output_folder - output\n",
"11:36:04 INFO - data factory data_ max_files -1, n_sample -1\n",
"11:36:04 INFO - data factory data_ Not using data sets, checkpointing False, max files -1, random samples -1, files to use ['.html'], files to checkpoint ['.parquet']\n",
"11:36:04 INFO - orchestrator html2parquet started at 2024-11-18 11:36:04\n",
"11:36:04 INFO - Number of files is 1, source profile {'max_file_size': 0.2035503387451172, 'min_file_size': 0.2035503387451172, 'total_file_size': 0.2035503387451172}\n",
"11:36:04 INFO - Completed 1 files (100.0%) in 0.004 min\n",
"11:36:04 INFO - Done processing 1 files, waiting for flush() completion.\n",
"11:36:04 INFO - done flushing in 0.0 sec\n",
"11:36:04 INFO - Completed execution in 0.004 min, execution result 0\n"
]
}
],
"source": [
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e2bee8da-c566-4e45-bca1-354dfd04b0df",
"metadata": {},
"outputs": [
Expand Down Expand Up @@ -126,7 +107,7 @@
" <td>![](https://images.prismic.io/ai-alliance/Ztf3...</td>\n",
" <td>f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121...</td>\n",
" <td>394</td>\n",
" <td>2024-11-18T11:36:04.040169</td>\n",
" <td>2024-11-18T11:47:31.262911</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -143,10 +124,10 @@
"0 f86b8cebe07ec9f43a351bb4dc897f162f5a88cbb0f121... 394 \n",
"\n",
" date_acquired \n",
"0 2024-11-18T11:36:04.040169 "
"0 2024-11-18T11:47:31.262911 "
]
},
"execution_count": 5,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -160,7 +141,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 4,
"id": "cde6e37d-c437-490f-8e01-f4f51a123484",
"metadata": {},
"outputs": [
Expand All @@ -170,30 +151,14 @@
"'![](https://images.prismic.io/ai-alliance/Ztf3gLzzk9ZrW8v8_caliopensourceslide.jpg?auto=format%2Ccompress&fit=max&w=3840)\\n\\n## Open Source AI Demo Night\\n\\nThe AI Alliance, in collaboration with Cerebral Valley and Ollama, hosted Open Source AI Demo Night in San Francisco, bringing together more than 200+ developers and innovators to showcase and celebrate the latest advances in open-source AI.'"
]
},
"execution_count": 6,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"table.to_pandas()['contents'][0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fd0d13b-1ff6-4988-91fb-52c25ba998c8",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "587e43ee-7b51-4a9c-8bf2-0a23e309a7ae",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 890a9da

Please sign in to comment.