|
2 | 2 | "cells": [
|
3 | 3 | {
|
4 | 4 | "cell_type": "markdown",
|
5 |
| - "metadata": {}, |
| 5 | + "metadata": { |
| 6 | + "id": "0o5UQ-Gy2Xvr" |
| 7 | + }, |
6 | 8 | "source": [
|
7 | 9 | "\n",
|
8 | 10 | "\n",
|
|
58 | 60 | },
|
59 | 61 | {
|
60 | 62 | "cell_type": "code",
|
61 |
| - "execution_count": null, |
62 |
| - "metadata": {}, |
| 63 | + "execution_count": 1, |
| 64 | + "metadata": { |
| 65 | + "id": "xrWTskQJ2Xv5" |
| 66 | + }, |
63 | 67 | "outputs": [],
|
64 | 68 | "source": [
|
65 | 69 | "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
|
66 | 70 | ]
|
67 | 71 | },
|
68 | 72 | {
|
69 | 73 | "cell_type": "markdown",
|
70 |
| - "metadata": {}, |
| 74 | + "metadata": { |
| 75 | + "id": "9B98jlOn2Xv8" |
| 76 | + }, |
71 | 77 | "source": [
|
72 | 78 | "For the local files example, we will download a TXT file from the Spark NLP GitHub repo:"
|
73 | 79 | ]
|
74 | 80 | },
|
75 | 81 | {
|
76 | 82 | "cell_type": "code",
|
77 |
| - "execution_count": 0, |
| 83 | + "execution_count": 11, |
78 | 84 | "metadata": {
|
79 | 85 | "application/vnd.databricks.v1+cell": {
|
80 | 86 | "cellMetadata": {
|
|
91 | 97 | "base_uri": "https://localhost:8080/"
|
92 | 98 | },
|
93 | 99 | "id": "ya8qZe00dalC",
|
94 |
| - "outputId": "268ccacb-ba1c-4753-f251-014fb0003f38" |
| 100 | + "outputId": "144186be-781d-451b-894e-d9c590a93c6a" |
95 | 101 | },
|
96 |
| - "outputs": [], |
| 102 | + "outputs": [ |
| 103 | + { |
| 104 | + "name": "stdout", |
| 105 | + "output_type": "stream", |
| 106 | + "text": [ |
| 107 | + "mkdir: cannot create directory ‘txt-files’: File exists\n", |
| 108 | + "--2025-03-07 00:33:21-- https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1113-Adding-support-to-enhance-read-TXT-files/src/test/resources/reader/txt/simple-text.txt\n", |
| 109 | + "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n", |
| 110 | + "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", |
| 111 | + "HTTP request sent, awaiting response... 200 OK\n", |
| 112 | + "Length: 300 [text/plain]\n", |
| 113 | + "Saving to: ‘txt-files/simple-text.txt’\n", |
| 114 | + "\n", |
| 115 | + "simple-text.txt 100%[===================>] 300 --.-KB/s in 0s \n", |
| 116 | + "\n", |
| 117 | + "2025-03-07 00:33:21 (4.67 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n", |
| 118 | + "\n" |
| 119 | + ] |
| 120 | + } |
| 121 | + ], |
97 | 122 | "source": [
|
98 | 123 | "!mkdir txt-files\n",
|
99 |
| - "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files" |
| 124 | + "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1113-Adding-support-to-enhance-read-TXT-files/src/test/resources/reader/txt/simple-text.txt -P txt-files" |
100 | 125 | ]
|
101 | 126 | },
|
102 | 127 | {
|
|
122 | 147 | },
|
123 | 148 | {
|
124 | 149 | "cell_type": "code",
|
125 |
| - "execution_count": 0, |
| 150 | + "execution_count": 12, |
126 | 151 | "metadata": {
|
127 | 152 | "application/vnd.databricks.v1+cell": {
|
128 | 153 | "cellMetadata": {
|
|
139 | 164 | "base_uri": "https://localhost:8080/"
|
140 | 165 | },
|
141 | 166 | "id": "bAkMjJ1vdalE",
|
142 |
| - "outputId": "a0a2e727-fcc3-474b-eaaa-20bf15f19773" |
| 167 | + "outputId": "74f0e218-6378-4df4-9b12-3ee6e33020e6" |
143 | 168 | },
|
144 | 169 | "outputs": [
|
145 | 170 | {
|
146 | 171 | "name": "stdout",
|
147 | 172 | "output_type": "stream",
|
148 | 173 | "text": [
|
149 | 174 | "Warning::Spark Session already created, some configs may not take.\n",
|
150 |
| - "+--------------------+--------------------+--------------------+\n", |
151 |
| - "| path| content| txt|\n", |
152 |
| - "+--------------------+--------------------+--------------------+\n", |
153 |
| - "|dbfs:/danilo/data...|BIG DATA ANALYTIC...|[{Title, BIG DATA...|\n", |
154 |
| - "+--------------------+--------------------+--------------------+\n", |
| 175 | + "+--------------------+--------------------+\n", |
| 176 | + "| path| txt|\n", |
| 177 | + "+--------------------+--------------------+\n", |
| 178 | + "|file:/content/txt...|[{Title, BIG DATA...|\n", |
| 179 | + "+--------------------+--------------------+\n", |
155 | 180 | "\n"
|
156 | 181 | ]
|
157 | 182 | }
|
158 | 183 | ],
|
159 | 184 | "source": [
|
160 | 185 | "import sparknlp\n",
|
161 |
| - "txt_df = sparknlp.read().txt(\"dbfs:/danilo/datasets/txt\")\n", |
162 | 186 | "\n",
|
| 187 | + "txt_df = sparknlp.read().txt(\"./txt-files\")\n", |
163 | 188 | "txt_df.show()"
|
164 | 189 | ]
|
165 | 190 | },
|
166 | 191 | {
|
167 | 192 | "cell_type": "code",
|
168 |
| - "execution_count": 0, |
| 193 | + "execution_count": 13, |
169 | 194 | "metadata": {
|
170 | 195 | "application/vnd.databricks.v1+cell": {
|
171 | 196 | "cellMetadata": {
|
|
182 | 207 | "base_uri": "https://localhost:8080/"
|
183 | 208 | },
|
184 | 209 | "id": "4iky1gvEz7Pt",
|
185 |
| - "outputId": "a986947b-f874-46bc-88c8-093dc42c83cb" |
| 210 | + "outputId": "ead23526-18be-4bb9-e952-38ef3d483cb0" |
186 | 211 | },
|
187 | 212 | "outputs": [
|
188 | 213 | {
|
|
201 | 226 | "source": [
|
202 | 227 | "txt_df.select(\"txt\").show(truncate=False)"
|
203 | 228 | ]
|
| 229 | + }, |
| 230 | + { |
| 231 | + "cell_type": "markdown", |
| 232 | + "metadata": { |
| 233 | + "id": "brto-6NX2wLT" |
| 234 | + }, |
| 235 | + "source": [ |
| 236 | + "You can also read files from distributed file systems (DFS), for example (see the sketch after this list):\n", |
| 237 | + "- Databricks: `dbfs://`\n", |
| 238 | + "- HDFS: `hdfs://`\n", |
| 239 | + "- Microsoft Fabric OneLake: `abfss://`" |
| 240 | + ] |
| 241 | + }, |
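| + { |
| + "cell_type": "markdown", |
| + "metadata": {}, |
| + "source": [ |
| + "For instance, a minimal sketch of reading from a distributed file system (the `hdfs://` URI below is a hypothetical placeholder, not a location used in this notebook):" |
| + ] |
| + }, |
| + { |
| + "cell_type": "code", |
| + "execution_count": null, |
| + "metadata": {}, |
| + "outputs": [], |
| + "source": [ |
| + "# Hypothetical placeholder URI: replace with a dbfs://, hdfs://, or abfss:// path you can access\n", |
| + "remote_txt_df = sparknlp.read().txt(\"hdfs://namenode:9000/data/txt-files\")\n", |
| + "remote_txt_df.show()" |
| + ] |
| + }, |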
| 242 | + { |
| 243 | + "cell_type": "markdown", |
| 244 | + "metadata": { |
| 245 | + "id": "CYnoVMVD211Z" |
| 246 | + }, |
| 247 | + "source": [ |
| 248 | + "### Configuration Parameters" |
| 249 | + ] |
| 250 | + }, |
| 251 | + { |
| 252 | + "cell_type": "markdown", |
| 253 | + "metadata": { |
| 254 | + "id": "rJhyeem_3Gqh" |
| 255 | + }, |
| 256 | + "source": [ |
| 257 | + "- `titleLengthSize`: You can customize the maximum length (in characters) used to decide whether a line of text should be treated as a title. By default, this limit is set to 50. If your text files require a different configuration, you can adjust this parameter accordingly. The example below demonstrates how to modify and work with this setting:" |
| 258 | + ] |
| 259 | + }, |
| 260 | + { |
| 261 | + "cell_type": "code", |
| 262 | + "execution_count": 19, |
| 263 | + "metadata": { |
| 264 | + "colab": { |
| 265 | + "base_uri": "https://localhost:8080/" |
| 266 | + }, |
| 267 | + "id": "nLUtWTk-3jcT", |
| 268 | + "outputId": "60d10ba0-cf91-4706-efb4-4e640d7e6bb0" |
| 269 | + }, |
| 270 | + "outputs": [ |
| 271 | + { |
| 272 | + "name": "stdout", |
| 273 | + "output_type": "stream", |
| 274 | + "text": [ |
| 275 | + "Warning::Spark Session already created, some configs may not take.\n", |
| 276 | + "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", |
| 277 | + "|path |txt |\n", |
| 278 | + "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", |
| 279 | + "|file:/content/txt-files/simple-text.txt|[{NarrativeText, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {NarrativeText, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|\n", |
| 280 | + "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", |
| 281 | + "\n" |
| 282 | + ] |
| 283 | + } |
| 284 | + ], |
| 285 | + "source": [ |
| 286 | + "params = {\"titleLengthSize\": \"5\"}\n", |
| 287 | + "txt_df = sparknlp.read(params).txt(\"./txt-files\")\n", |
| 288 | + "txt_df.show(truncate=False)" |
| 289 | + ] |
| 290 | + }, |
| 291 | + { |
| 292 | + "cell_type": "markdown", |
| 293 | + "metadata": { |
| 294 | + "id": "d444S-MK239M" |
| 295 | + }, |
| 296 | + "source": [ |
| 297 | + "- `storeContent`: By default, this is set to `false`. When enabled, the output will include the raw content of the file." |
| 298 | + ] |
| 299 | + }, |
| 300 | + { |
| 301 | + "cell_type": "code", |
| 302 | + "execution_count": 18, |
| 303 | + "metadata": { |
| 304 | + "colab": { |
| 305 | + "base_uri": "https://localhost:8080/" |
| 306 | + }, |
| 307 | + "id": "optYF_SS22TW", |
| 308 | + "outputId": "e21f8dab-ef69-432b-aa3e-fb0afc075bbb" |
| 309 | + }, |
| 310 | + "outputs": [ |
| 311 | + { |
| 312 | + "name": "stdout", |
| 313 | + "output_type": "stream", |
| 314 | + "text": [ |
| 315 | + "Warning::Spark Session already created, some configs may not take.\n", |
| 316 | + "+--------------------+--------------------+--------------------+\n", |
| 317 | + "| path| txt| content|\n", |
| 318 | + "+--------------------+--------------------+--------------------+\n", |
| 319 | + "|file:/content/txt...|[{Title, BIG DATA...|BIG DATA ANALYTIC...|\n", |
| 320 | + "+--------------------+--------------------+--------------------+\n", |
| 321 | + "\n" |
| 322 | + ] |
| 323 | + } |
| 324 | + ], |
| 325 | + "source": [ |
| 326 | + "params = {\"storeContent\": \"true\"}\n", |
| 327 | + "txt_df = sparknlp.read(params).txt(\"./txt-files\")\n", |
| 328 | + "txt_df.show()" |
| 329 | + ] |
204 | 330 | }
|
205 | 331 | ],
|
206 | 332 | "metadata": {
|
|