Commit cdb8f36

[SPARKNLP-1117] Adding storeContent param
1 parent 30502cc commit cdb8f36

File tree

5 files changed: +174 -22 lines changed

examples/python/reader/SparkNLP_TXT_Reader_Demo.ipynb: +144 -18

@@ -2,7 +2,9 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "id": "0o5UQ-Gy2Xvr"
+   },
    "source": [
     "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
     "\n",
@@ -58,23 +60,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
+   "execution_count": 1,
+   "metadata": {
+    "id": "xrWTskQJ2Xv5"
+   },
    "outputs": [],
    "source": [
     "! wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
    ]
   },
   {
    "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
+    "id": "9B98jlOn2Xv8"
+   },
    "source": [
     "For local files example we will download a TXT file from Spark NLP Github repo:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": 11,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -91,12 +97,31 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "ya8qZe00dalC",
-    "outputId": "268ccacb-ba1c-4753-f251-014fb0003f38"
+    "outputId": "144186be-781d-451b-894e-d9c590a93c6a"
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mkdir: cannot create directory ‘txt-files’: File exists\n",
+      "--2025-03-07 00:33:21--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1113-Adding-support-to-enhance-read-TXT-files/src/test/resources/reader/txt/simple-text.txt\n",
+      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...\n",
+      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 300 [text/plain]\n",
+      "Saving to: ‘txt-files/simple-text.txt’\n",
+      "\n",
+      "simple-text.txt     100%[===================>]     300  --.-KB/s    in 0s      \n",
+      "\n",
+      "2025-03-07 00:33:21 (4.67 MB/s) - ‘txt-files/simple-text.txt’ saved [300/300]\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
     "!mkdir txt-files\n",
-    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/src/test/resources/reader/txt/simple-text.txt -P txt-files"
+    "!wget https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/feature/SPARKNLP-1113-Adding-support-to-enhance-read-TXT-files/src/test/resources/reader/txt/simple-text.txt -P txt-files"
    ]
   },
   {
@@ -122,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": 12,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -139,33 +164,33 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "bAkMjJ1vdalE",
-    "outputId": "a0a2e727-fcc3-474b-eaaa-20bf15f19773"
+    "outputId": "74f0e218-6378-4df4-9b12-3ee6e33020e6"
    },
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Warning::Spark Session already created, some configs may not take.\n",
-      "+--------------------+--------------------+--------------------+\n",
-      "|                path|             content|                 txt|\n",
-      "+--------------------+--------------------+--------------------+\n",
-      "|dbfs:/danilo/data...|BIG DATA ANALYTIC...|[{Title, BIG DATA...|\n",
-      "+--------------------+--------------------+--------------------+\n",
+      "+--------------------+--------------------+\n",
+      "|                path|                 txt|\n",
+      "+--------------------+--------------------+\n",
+      "|file:/content/txt...|[{Title, BIG DATA...|\n",
+      "+--------------------+--------------------+\n",
       "\n"
      ]
     }
    ],
    "source": [
     "import sparknlp\n",
-    "txt_df = sparknlp.read().txt(\"dbfs:/danilo/datasets/txt\")\n",
     "\n",
+    "txt_df = sparknlp.read().txt(\"./txt-files\")\n",
     "txt_df.show()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 0,
+   "execution_count": 13,
    "metadata": {
     "application/vnd.databricks.v1+cell": {
      "cellMetadata": {
@@ -182,7 +207,7 @@
      "base_uri": "https://localhost:8080/"
     },
     "id": "4iky1gvEz7Pt",
-    "outputId": "a986947b-f874-46bc-88c8-093dc42c83cb"
+    "outputId": "ead23526-18be-4bb9-e952-38ef3d483cb0"
    },
    "outputs": [
     {
@@ -201,6 +226,107 @@
    "source": [
     "txt_df.select(\"txt\").show(truncate=False)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "brto-6NX2wLT"
+   },
+   "source": [
+    "You can also use DFS file systems like:\n",
+    "- Databricks: `dbfs://`\n",
+    "- HDFS: `hdfs://`\n",
+    "- Microsoft Fabric OneLake: `abfss://`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "CYnoVMVD211Z"
+   },
+   "source": [
+    "### Configuration Parameters"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "rJhyeem_3Gqh"
+   },
+   "source": [
+    "- `titleLengthSize`: You can customize the font size used to identify titles that should be treated as titles. By default, the font size is set to 50. However, if your text files require a different configuration, you can adjust this parameter accordingly. The example below demonstrates how to modify and work with this setting:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "nLUtWTk-3jcT",
+    "outputId": "60d10ba0-cf91-4706-efb4-4e640d7e6bb0"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|path |txt |\n",
+      "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "|file:/content/txt-files/simple-text.txt|[{NarrativeText, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {NarrativeText, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|\n",
+      "+---------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "params = {\"titleLengthSize\": \"5\"}\n",
+    "txt_df = sparknlp.read(params).txt(\"./txt-files\")\n",
+    "txt_df.show(truncate=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "d444S-MK239M"
+   },
+   "source": [
+    "- `storeContent`: By default, this is set to `false`. When enabled, the output will include the raw content of the file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "optYF_SS22TW",
+    "outputId": "e21f8dab-ef69-432b-aa3e-fb0afc075bbb"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning::Spark Session already created, some configs may not take.\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "|                path|                 txt|             content|\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "|file:/content/txt...|[{Title, BIG DATA...|BIG DATA ANALYTIC...|\n",
+      "+--------------------+--------------------+--------------------+\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "params = {\"storeContent\": \"true\"}\n",
+    "txt_df = sparknlp.read(params).txt(\"./txt-files\")\n",
+    "txt_df.show()"
+   ]
   }
  ],
 "metadata": {

python/sparknlp/reader/sparknlp_reader.py: +15

@@ -267,6 +267,21 @@ def txt(self, docPath):
         -------
         pyspark.sql.DataFrame
             A DataFrame containing parsed document content.
+
+        Examples
+        --------
+        >>> from sparknlp.reader import SparkNLPReader
+        >>> txtDf = SparkNLPReader().txt(spark, "home/user/txt/files")
+
+        You can use SparkNLP for one line of code
+        >>> import sparknlp
+        >>> txtDf = sparknlp.read().txt("home/user/txt/files")
+        >>> txtDf.show(truncate=False)
+        +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        |txt |
+        +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+        |[{Title, BIG DATA ANALYTICS, {paragraph -> 0}}, {NarrativeText, Apache Spark is a fast and general-purpose cluster computing system.\nIt provides high-level APIs in Java, Scala, Python, and R., {paragraph -> 0}}, {Title, MACHINE LEARNING, {paragraph -> 1}}, {NarrativeText, Spark's MLlib provides scalable machine learning algorithms.\nIt includes tools for classification, regression, clustering, and more., {paragraph -> 1}}]|
+        +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
         """
         if not isinstance(docPath, str):
             raise TypeError("docPath must be a string")

src/main/scala/com/johnsnowlabs/reader/SparkNLPReader.scala: +1 -1

@@ -352,7 +352,7 @@ class SparkNLPReader(params: java.util.Map[String, String] = new java.util.HashM
    * Parameter with custom configuration
    */
   def txt(filePath: String): DataFrame = {
-    val textReader = new TextReader(getTitleLengthSize)
+    val textReader = new TextReader(getTitleLengthSize, getStoreContent)
     textReader.txt(filePath)
   }
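
`getStoreContent` itself is outside this hunk. A plausible sketch of what such a getter could look like over the constructor's string-keyed params map; the key name and `false` default are assumptions inferred from the notebook demo, not the committed implementation:

import scala.collection.JavaConverters._

// Hypothetical getter, shown only to illustrate how the flag could reach
// TextReader; the real method lives elsewhere in SparkNLPReader.
private def getStoreContent: Boolean =
  params.asScala.getOrElse("storeContent", "false").toBoolean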

src/main/scala/com/johnsnowlabs/reader/TextReader.scala: +4 -3

@@ -21,7 +21,7 @@ import org.apache.spark.sql.functions.udf

 import scala.collection.mutable

-class TextReader(titleLengthSize: Int = 50) extends Serializable {
+class TextReader(titleLengthSize: Int = 50, storeContent: Boolean = false) extends Serializable {

   private val spark = ResourceHelper.spark
   import spark.implicits._
@@ -36,9 +36,10 @@ class TextReader(titleLengthSize: Int = 50) extends Serializable {
   def txt(filePath: String): DataFrame = {
     if (ResourceHelper.validFile(filePath)) {
       val textFilesRDD = spark.sparkContext.wholeTextFiles(filePath)
-      textFilesRDD
+      val textDf = textFilesRDD
         .toDF("path", "content")
         .withColumn("txt", parseTxtUDF($"content"))
+      if (storeContent) textDf.select("path", "txt", "content") else textDf.select("path", "txt")
     } else {
       throw new IllegalArgumentException(s"Invalid filePath: $filePath")
     }
@@ -102,7 +103,7 @@ class TextReader(titleLengthSize: Int = 50) extends Serializable {
     if (trimmed.isEmpty) return false
     val isAllUpper = trimmed.forall(c => !c.isLetter || c.isUpper)
     val isTitleCase = trimmed.split("\\s+").forall(word => word.headOption.exists(_.isUpper))
-    val isShort = trimmed.length <= 50
+    val isShort = trimmed.length <= titleLengthSize
     val hasLetters = trimmed.exists(_.isLetter)
     (isAllUpper || isTitleCase) && isShort && hasLetters
   }
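
The new test in TextReaderTest.scala below drives this constructor directly. A minimal standalone sketch of the same call; the resource directory is an assumption taken from the path the demo notebook downloads from (`src/test/resources/reader/txt`):

import com.johnsnowlabs.reader.TextReader

// With storeContent = true the returned DataFrame keeps the raw file text
// in a "content" column alongside "path" and "txt".
val reader = new TextReader(titleLengthSize = 50, storeContent = true)
val textDf = reader.txt("src/test/resources/reader/txt")
textDf.printSchema() // expected columns: path, txt, content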

src/test/scala/com/johnsnowlabs/reader/TextReaderTest.scala: +10

@@ -29,6 +29,16 @@ class TextReaderTest extends AnyFlatSpec {
     textDf.select("txt").show(false)

     assert(!textDf.select(col("txt").getItem(0)).isEmpty)
+    assert(!textDf.columns.contains("content"))
+  }
+
+  "Text Reader" should "store content" taggedAs FastTest in {
+    val textReader = new TextReader(storeContent = true)
+    val textDf = textReader.txt(txtDirectory)
+    textDf.show()
+
+    assert(!textDf.select(col("txt").getItem(0)).isEmpty)
+    assert(textDf.columns.contains("content"))
   }

 }
