Hello!
I want to extract all the text on a slide. To do this, I am going through each shape on the slide and checking for text. However, some shapes are groups shapes. For these, I need to use the sub_shape parameter. But what if a group shape is nested within a group shape? For such scenarios, I made a recursive function that either grabs the text or dives deeper into the inner group shape.
The function seems to mostly work, however there are weird edge cases where Aspose returns the wrong number of nested shapes. Specifically, if I call aspose_slides_api.get_shapes() on a triply nested group_shape, an incorrect, partial set of shape_links is returned.
Below is my code and attached is the file you can use if you would like to replicate this bug.
from asposeslidescloud.apis.slides_api import SlidesApi
ASPOSE_APP_SID = "side"
ASPOSE_APP_KEY = "key"
STORAGE = "Dev Internal Storage"
slides_api = SlidesApi(app_sid=ASPOSE_APP_SID, app_key=ASPOSE_APP_KEY)
def aspose_slides_upload_file(slides_api, aspose_file_name: str, file_bytes: bytes):
try:
slides_api.upload_file(
path=aspose_file_name, file=file_bytes, storage_name=STORAGE
)
except Exception as e:
print("Exception in aspose_slides_upload_file: " + str(e))
raise e
def aspose_slides_get_slides(slides_api, aspose_file_name: str, aspose_folder: str):
try:
return slides_api.get_slides(
name=aspose_file_name, folder=aspose_folder, storage=STORAGE
)
except Exception as e:
print("Exception in aspose_slides_get_slides: " + str(e))
raise e
def aspose_slides_get_shapes(
slides_api,
aspose_file_name: str,
aspose_folder: str,
slide_index: int,
sub_shape: str = None,
):
try:
return slides_api.get_shapes(
name=aspose_file_name,
folder=aspose_folder,
storage=STORAGE,
slide_index=slide_index,
sub_shape=sub_shape,
)
except Exception as e:
print("Exception in aspose_slides_get_shapes: " + str(e))
raise e
def aspose_slides_get_shape_info(
slides_api,
aspose_file_name: str,
aspose_folder: str,
slide_index: int,
shape_index: int,
sub_shape: str = None,
):
try:
return slides_api.get_shape(
name=aspose_file_name,
folder=aspose_folder,
storage=STORAGE,
slide_index=slide_index,
shape_index=shape_index,
sub_shape=sub_shape,
)
except Exception as e:
print("Exception in aspose_slides_get_shape_info: " + str(e))
raise e
def aspose_slides_get_text_from_group_shapes(
slides_api,
aspose_file_name: str,
slide_index: int,
path_for_get_shapes: str,
path_for_get_details: str,
outermost_group_shape_idx: int,
) -> list:
print()
shapes = aspose_slides_get_shapes(
slides_api=slides_api,
aspose_file_name=aspose_file_name,
aspose_folder="",
slide_index=slide_index,
sub_shape=path_for_get_shapes,
)
# This is where things are going wrong.
# The wrong number of shapes gets returned for shapes nested within group shapes
print(f"number of shapes: {len(shapes.shapes_links)}")
text_list = []
for shape in shapes.shapes_links:
print(f"shape_index: {shape.shape_index}")
if not path_for_get_details:
details_sub_path = shape.shape_index
else:
details_sub_path = f"{path_for_get_details}/shapes/{shape.shape_index}"
shape_details = aspose_slides_get_shape_info(
slides_api=slides_api,
aspose_file_name=aspose_file_name,
aspose_folder="",
shape_index=outermost_group_shape_idx,
slide_index=slide_index,
sub_shape=details_sub_path,
)
print(f"shape_name: {shape_details.name}")
if hasattr(shape_details, "text"):
text_list.append(shape_details.text)
if shape_details.type == "GroupShape" and shape_details.shapes:
print(f"group shape found: {shape_details.name}. Diving deeper...")
new_words = aspose_slides_get_text_from_group_shapes(
slides_api=slides_api,
aspose_file_name=aspose_file_name,
slide_index=slide_index,
path_for_get_shapes=f"{outermost_group_shape_idx}/shapes/{shape.shape_index}",
path_for_get_details=details_sub_path,
outermost_group_shape_idx=outermost_group_shape_idx,
)
text_list.extend(new_words)
return text_list
def main():
aspose_file_name = "nested_test.pptx"
with open("input/nested_test.pptx", "rb") as f:
file_bytes = f.read()
aspose_slides_upload_file(
slides_api=slides_api, aspose_file_name=aspose_file_name, file_bytes=file_bytes
)
slides = aspose_slides_get_slides(
slides_api=slides_api, aspose_file_name=aspose_file_name, aspose_folder=""
)
text_list = []
for slide_index, slide in enumerate(slides.slide_list, start=1):
print()
print(f"new slide: {slide_index}")
# Get all shapes on the slide
shapes = aspose_slides_get_shapes(
slides_api=slides_api,
aspose_file_name=aspose_file_name,
aspose_folder="",
slide_index=slide_index,
)
for shape in shapes.shapes_links:
shape_details = aspose_slides_get_shape_info(
slides_api=slides_api,
aspose_file_name=aspose_file_name,
aspose_folder="",
shape_index=shape.shape_index,
slide_index=slide_index,
)
print(f"number of shapes: {len(shapes.shapes_links)}")
print(f"shape_index: {shape.shape_index}")
print(f"shape_name: {shape_details.name}")
if hasattr(shape_details, "text"):
text_list.append(shape_details.text)
if shape_details.type == "GroupShape" and shape_details.shapes:
# Group shapes must be recursively checked for nested text
print(f"group shape found: {shape_details.name}. Diving deeper...")
nested_text = aspose_slides_get_text_from_group_shapes(
slides_api=slides_api,
aspose_file_name=aspose_file_name,
slide_index=slide_index,
path_for_get_shapes=shape.shape_index,
path_for_get_details="",
outermost_group_shape_idx=shape.shape_index,
)
if __name__ == "__main__":
main()
Here is the output I get when running this code:
new slide: 1
number of shapes: 1
shape_index: 1
shape_name: Group 5
group shape found: Group 5. Diving deeper...
number of shapes: 2
shape_index: 1
shape_name: TextBox 6
shape_index: 2
shape_name: Group 7
group shape found: Group 7. Diving deeper...
number of shapes: 2
shape_index: 1
shape_name: Smiley Face 8
shape_index: 2
shape_name: Group 9
group shape found: Group 9. Diving deeper...
number of shapes: 2 -> This is wrong. There are 3 shapes inside this group.
shape_index: 1
shape_name: Picture 10
shape_index: 2
shape_name: TextBox 11
nested_test.pptx.zip (439.1 KB)