Commit cdb8f36

[SPARKNLP-1117] Adding storeContent param
1 parent 30502cc commit cdb8f36

File tree

5 files changed: +174 -22 lines changed

examples/python/reader/SparkNLP_TXT_Reader_Demo.ipynb: +144 -18

@@ -2,7 +2,9 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "id": "0o5UQ-Gy2Xvr"
+   },
    "source": [
     "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
     "\n",
@@ -58,23 +60,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "execution_count": 1,
+   "metadata": {
+    "id": "xrWTskQJ2Xv5"
+   },
    "outputs": [],
    "source": [
     "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "id": "9B98jlOn2Xv8"
+   },
    "source": [
     "For local files example we will download a TXT file from Spark NLP Github repo:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": 11,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -91,12 +97,31 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "ya8qZe00dalC",
-    "outputId": "268ccacb-ba1c-4753-f251-014fb0003f38"
+    "outputId": "144186be-781d-451b-894e-d9c590a93c6a"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mkdir: cannot create directory ‘txt-files’: File exists\n",
+      "--2025-03-07 00:33:21--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1113-Adding-support-to-enhance-read-TXT-files/src/test/resources/reader/txt/simple-text.txt\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 300 [text/plain]\n",
+      "Saving to: ‘txt-files/simple-text.txt’\n",
+      "\n",
+      "simple-text.txt     100%[===================>]     300  --.-KB/s    in 0s      \n",
+      "\n",
+      "2025-03-07 00:33:21 (4.67 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "!mkdir txt-files\n",
-    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files"
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1113-Adding-support-to-enhance-read-TXT-files/src/test/resources/reader/txt/simple-text.txt -P txt-files"
    ]
   },
   {
@@ -122,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": 12,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -139,33 +164,33 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "bAkMjJ1vdalE",
-    "outputId": "a0a2e727-fcc3-474b-eaaa-20bf15f19773"
+    "outputId": "74f0e218-6378-4df4-9b12-3ee6e33020e6"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+--------------------+--------------------+--------------------+\n",
-      "|                path|             content|                 txt|\n",
-      "+--------------------+--------------------+--------------------+\n",
-      "|dbfs:/danilo/data...|BIG DATA ANALYTIC...|[{Title, BIG DATA...|\n",
-      "+--------------------+--------------------+--------------------+\n",
+      "+--------------------+--------------------+\n",
+      "|                path|                 txt|\n",
+      "+--------------------+--------------------+\n",
+      "|file:/content/txt...|[{Title, BIG DATA...|\n",
+      "+--------------------+--------------------+\n",
       "\n"
      ]
     }
    ],
    "source": [
     "import sparknlp\n",
-    "txt_df = sparknlp.read().txt(\"dbfs:/danilo/datasets/txt\")\n",
     "\n",
+    "txt_df = sparknlp.read().txt(\"./txt-files\")\n",
     "txt_df.show()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": 13,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -182,7 +207,7 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "4iky1gvEz7Pt",
-    "outputId": "a986947b-f874-46bc-88c8-093dc42c83cb"
+    "outputId": "ead23526-18be-4bb9-e952-38ef3d483cb0"
    },
    "outputs": [
     {
@@ -201,6 +226,107 @@
    "source": [
     "txt_df.select(\"txt\").show(truncate=False)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "brto-6NX2wLT"
+   },
+   "source": [
+    "You can also use DFS file systems like:\n",
+    "- Databricks: `dbfs://`\n",
+    "- HDFS: `hdfs://`\n",
+    "- Microsoft Fabric OneLake: `abfss://`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "CYnoVMVD211Z"
+   },
+   "source": [
+    "### Configuration Parameters"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rJhyeem_3Gqh"
+   },
+   "source": [
+    "- `titleLengthSize`: You can customize the font size used to identify titles that should be treated as titles. By default, the font size is set to 50. However, if your text files require a different configuration, you can adjust this parameter accordingly. The example below demonstrates how to modify and work with this setting:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "nLUtWTk-3jcT",
+    "outputId": "60d10ba0-cf91-4706-efb4-4e640d7e6bb0"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|path |txt |\n",
+      "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|file:/content/txt-files/simple-text.txt|[{NarrativeText, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {NarrativeText, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|\n",
+      "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "params = {\"titleLengthSize\": \"5\"}\n",
+    "txt_df = sparknlp.read(params).txt(\"./txt-files\")\n",
+    "txt_df.show(truncate=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "d444S-MK239M"
+   },
+   "source": [
+    "- `storeContent`: By default, this is set to `false`. When enabled, the output will include the raw content of the file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "optYF_SS22TW",
+    "outputId": "e21f8dab-ef69-432b-aa3e-fb0afc075bbb"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "|                path|                 txt|             content|\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "|file:/content/txt...|[{Title, BIG DATA...|BIG DATA ANALYTIC...|\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "params = {\"storeContent\": \"true\"}\n",
+    "txt_df = sparknlp.read(params).txt(\"./txt-files\")\n",
+    "txt_df.show()"
+   ]
   }
  ],
 "metadata": {

python/sparknlp/reader/sparknlp_reader.py: +15

@@ -267,6 +267,21 @@ def txt(self, docPath):
         -------
         pyspark.sql.DataFrame
             A DataFrame containing parsed document content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> txtDf = SparkNLPReader().txt(spark, "home/user/txt/files")
+
+        You can use SparkNLP for one line of code
+        >>> import sparknlp
+        >>> txtDf = sparknlp.read().txt("home/user/txt/files")
+        >>> txtDf.show(truncate=False)
+        +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        |txt |
+        +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {Title, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|
+        +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
         """
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")

src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala: +1 -1

@@ -352,7 +352,7 @@ class SparkNLPReader(params: java.util.Map[String, String] = new java.util.HashM
    * Parameter with custom configuration
    */
   def txt(filePath: String): DataFrame = {
-    val textReader = new TextReader(getTitleLengthSize)
+    val textReader = new TextReader(getTitleLengthSize, getStoreContent)
     textReader.txt(filePath)
   }
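
`getStoreContent` itself is outside this hunk. A plausible sketch of what such a getter could look like over the constructor's string-keyed params map; the key name and `false` default are assumptions inferred from the notebook demo, not the committed implementation:

import scala.collection.JavaConverters._

// Hypothetical getter, shown only to illustrate how the flag could reach
// TextReader; the real method lives elsewhere in SparkNLPReader.
private def getStoreContent: Boolean =
  params.asScala.getOrElse("storeContent", "false").toBoolean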

src/main/scala/com/johnsnowlabs/reader/TextReader.scala: +4 -3

@@ -21,7 +21,7 @@ import org.apache.spark.sql.functions.udf

 import scala.collection.mutable

-class TextReader(titleLengthSize: Int = 50) extends Serializable {
+class TextReader(titleLengthSize: Int = 50, storeContent: Boolean = false) extends Serializable {

   private val spark = ResourceHelper.spark
   import spark.implicits._
@@ -36,9 +36,10 @@ class TextReader(titleLengthSize: Int = 50) extends Serializable {
   def txt(filePath: String): DataFrame = {
     if (ResourceHelper.validFile(filePath)) {
       val textFilesRDD = spark.sparkContext.wholeTextFiles(filePath)
-      textFilesRDD
+      val textDf = textFilesRDD
         .toDF("path", "content")
         .withColumn("txt", parseTxtUDF($"content"))
+      if (storeContent) textDf.select("path", "txt", "content") else textDf.select("path", "txt")
     } else {
       throw new IllegalArgumentException(s"Invalid filePath: $filePath")
     }
@@ -102,7 +103,7 @@ class TextReader(titleLengthSize: Int = 50) extends Serializable {
     if (trimmed.isEmpty) return false
     val isAllUpper = trimmed.forall(c => !c.isLetter || c.isUpper)
     val isTitleCase = trimmed.split("\\s+").forall(word => word.headOption.exists(_.isUpper))
-    val isShort = trimmed.length <= 50
+    val isShort = trimmed.length <= titleLengthSize
     val hasLetters = trimmed.exists(_.isLetter)
     (isAllUpper || isTitleCase) && isShort && hasLetters
   }
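
The new test in TextReaderTest.scala below drives this constructor directly. A minimal standalone sketch of the same call; the resource directory is an assumption taken from the path the demo notebook downloads from (`src/test/resources/reader/txt`):

import com.johnsnowlabs.reader.TextReader

// With storeContent = true the returned DataFrame keeps the raw file text
// in a "content" column alongside "path" and "txt".
val reader = new TextReader(titleLengthSize = 50, storeContent = true)
val textDf = reader.txt("src/test/resources/reader/txt")
textDf.printSchema() // expected columns: path, txt, content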

src/test/scala/com/johnsnowlabs/reader/TextReaderTest.scala: +10

@@ -29,6 +29,16 @@ class TextReaderTest extends AnyFlatSpec {
     textDf.select("txt").show(false)

     assert(!textDf.select(col("txt").getItem(0)).isEmpty)
+    assert(!textDf.columns.contains("content"))
+  }
+
+  "Text Reader" should "store content" taggedAs FastTest in {
+    val textReader = new TextReader(storeContent = true)
+    val textDf = textReader.txt(txtDirectory)
+    textDf.show()
+
+    assert(!textDf.select(col("txt").getItem(0)).isEmpty)
+    assert(textDf.columns.contains("content"))
   }

 }
