# Проверка ссылок на изображения в блокнотах и на сетевом диске

In [1]:
# Notebook configuration.
# Personal access token sent to the GitHub API in the "Authorization" header;
# replace the placeholder text with a real token before running.
github_token = "Input your access token here"
# Branches of the GitHub repositories whose notebooks are scanned for links.
# The first entry is also the branch used to list the lecture files.
dev_branches = [
    "dev-1.7",
    "dev-1.8",
]

In [2]:
from bs4 import BeautifulSoup
import requests
import base64
import json
import os
import re

class Url:
    """URL wrapper whose equality, hashing and ordering ignore the scheme.

    Two instances compare equal when everything after the first "://" is
    identical, so "http://host/a.png" and "https://host/a.png" are treated as
    the same link. The original scheme is kept so that str() reproduces the
    input string.

    Raises:
        ValueError: if the string passed to the constructor has no "://".
    """

    def __init__(self, string):
        # Split on the FIRST "://" only: a URL may legitimately contain
        # another "://" later on (e.g. inside a query parameter).
        protocol, separator, address = string.partition("://")
        if not separator:
            raise ValueError(f"expected '<scheme>://<address>', got {string!r}")
        self.protocol = protocol
        self.address = address

    def __eq__(self, other):
        if not isinstance(other, Url):
            # Let Python try the reflected comparison / fall back to identity.
            return NotImplemented
        return self.address == other.address

    def __hash__(self):
        # Must agree with __eq__, hence the scheme is not part of the hash.
        return hash(self.address)

    def __lt__(self, other):
        # Only "<" is defined; that is all sorted() needs.
        if not isinstance(other, Url):
            return NotImplemented
        return self.address < other.address

    def __str__(self):
        return self.protocol + '://' + self.address

    def __repr__(self):
        return f"{type(self).__name__}({str(self)!r})"
def get_links_from_dir(url, blacklist=()):
    """Recursively collect image links from a directory listing on a server.

    Args:
        url: address of the directory listing page; hrefs found on the page
            are appended to it, so it is expected to end with "/".
        blacklist: iterable of substrings; a directory whose URL contains any
            of them is not searched and contributes no links.

    Returns:
        A list of Url objects, one per link whose full address matches the
        image pattern (png, jpg/jpeg, gif, svg).
    """
    # A blacklisted directory is not checked for images.
    if any(s in url for s in blacklist):
        return []
    html_text = requests.get(url).text
    img_pattern = re.compile(r"https?://.*\.(png|jpe?g|gif|svg)\\?")
    # The first href on the page is skipped.
    # NOTE(review): presumably it is the parent-directory entry of the
    # listing -- confirm against the server's index page format.
    anchors = BeautifulSoup(html_text, 'lxml').find_all(href=True)[1:]
    lecture_links = []
    for link in anchors:
        href = link.get('href')
        target = url + href
        # endswith() instead of href[-1] so an empty href cannot raise.
        if href.endswith('/'):
            # Sub-directory: descend, keeping the blacklist in force for the
            # whole tree (it used to be dropped after the first level).
            lecture_links += get_links_from_dir(target, blacklist)
        elif img_pattern.search(target):
            lecture_links.append(Url(target))
    return lecture_links
def get_host_links(lec_num, blacklist=(), ignore=None):
    """Collect the image links stored on the server for every lecture.

    Images on the server are generally stored in three locations:
      - the EduNet-content folder
      - the EduNet-web_dependencies folder
      - the src folder (deprecated)

    Args:
        lec_num: iterable of lecture codes such as 'L01'.
        blacklist: substrings of directory URLs that must not be searched;
            forwarded to get_links_from_dir.
        ignore: optional iterable of Url objects to exclude from the result.
            When omitted, the module-level ``ignore_set`` is used if it is
            defined (the previous behaviour); otherwise nothing is excluded.

    Returns:
        A dict mapping each lecture code to the set of Url objects found for
        it, e.g. {'L01': {...}, 'L02': {...}}.
    """
    if ignore is None:
        # Backward compatibility: callers used to rely on a global ignore_set.
        ignore = globals().get('ignore_set', ())
    ignore = set(ignore)

    # The directories in src are called by the full lecture title rather than
    # just the number, so a match is needed to select the right one to search.
    lecture_name_pattern = re.compile(r"(L\d+)(\w+)")
    # (lecture code, directory name) pairs from the src listing. Fetched on
    # first use and then reused, instead of once per lecture.
    src_dirs = None

    host_links = {}
    print('Get host links...')
    for lecture in lec_num:
        lecture_links = []
        # Recursively search the EduNet-content folder for this lecture.
        url = f'https://edunet.kea.su/repo/EduNet-content/{lecture}/'
        lecture_links += get_links_from_dir(url, blacklist)
        # Recursively search the EduNet-web_dependencies folder for this lecture.
        url = f'https://edunet.kea.su/repo/EduNet-web_dependencies/{lecture}/'
        lecture_links += get_links_from_dir(url, blacklist)

        if src_dirs is None:
            html_text = requests.get('https://edunet.kea.su/repo/src/').text
            src_dirs = []
            for link in BeautifulSoup(html_text, 'lxml').find_all(href=True):
                match = lecture_name_pattern.match(link.get('href'))
                if match:
                    src_dirs.append((match[1], match[0]))
        for code, dir_name in src_dirs:
            if code == lecture:
                # Recursively search the discovered folder.
                lecture_links += get_links_from_dir(f'https://edunet.kea.su/repo/src/{dir_name}/img_license/', blacklist)
                lecture_links += get_links_from_dir(f'https://edunet.kea.su/repo/src/{dir_name}/img/', blacklist)

        host_links[lecture] = set(lecture_links) - ignore
    print('Host links received!')
    return host_links
def match_patterns(string, patterns):
    """Search ``string`` with each pattern in turn and stop at the first hit.

    Returns the result of the first successful ``search`` call. When no
    pattern matches, the result of the last ``search`` call is returned, and
    when ``patterns`` is empty the return value is False.
    """
    outcome = False
    for pattern in patterns:
        outcome = pattern.search(string)
        if outcome:
            return outcome
    return outcome
def strip_protocol(url: str) -> str:
    """Remove a leading "http" or "https" scheme name from ``url``.

    The "://" separator is left in place, so "https://host/x" becomes
    "://host/x" (the same result the function has always produced for
    http/https URLs). Strings that do not start with "http://" or "https://"
    are returned unchanged.
    """
    # str.lstrip("https") would strip any leading run of the characters
    # h/t/p/s rather than the prefix, mangling inputs such as
    # "stackoverflow.com"; match the scheme explicitly instead.
    for scheme in ("https", "http"):
        if url.startswith(scheme + "://"):
            return url[len(scheme):]
    return url
def get_lectures_links(dev_branches, github_token, repo_url):
    """Collect image links used by the notebooks of a GitHub repository.

    Iterates over the given branches, downloads the notebooks found under the
    repository's ``out`` folder through the GitHub contents API and searches
    their cells for links to images.

    Args:
        dev_branches: sequence of branch names. The file listing is taken
            from the first branch and reused for all of them.
        github_token: access token sent in the "Authorization" header; a
            falsy value sends no such header.
        repo_url: API URL of the repository, e.g.
            'https://api.github.com/repos/<owner>/<repo>'.

    Returns:
        A dict mapping the first three characters of each notebook path
        (the lecture code, e.g. 'L01') to a set of Url objects. Empty when
        ``dev_branches`` is empty.

    Raises:
        requests.HTTPError: if a GitHub API request returns an error status.
    """
    # pattern for image markdown in notebook
    img_pattern = re.compile(r"<img(?:\s[^<>\"\']*)?\ssrc\s*=\s*\"([^<>\"\']*)\".*?>")
    # pattern for a url to an image stored on the server
    path_pattern = re.compile(
        r"https?://edunet\.kea\.su/repo/(EduNet-(?:(?:content)|(?:web_dependencies))/(L\d+)/\S+?\.(png|jpe?g|gif|svg))")
    # a general image url pattern
    non_disk_pattern = re.compile(r"https?://[^\"\']*?\.(png|jpe?g|gif|svg)\\?")
    url_patterns = [path_pattern, non_disk_pattern]
    var_patten = re.compile(r'(?:' + path_pattern.pattern + r"|" + non_disk_pattern.pattern + r')')

    res_links = {}
    if not dev_branches:
        # Nothing to scan; avoids an IndexError on dev_branches[0] below.
        print('All links received!')
        return res_links

    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"

    # get the lecture names for the default branch
    url = repo_url + f'/contents/out?ref={dev_branches[0]}'
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    lec_data = r.json()

    for branch in dev_branches:
        print(f'\tGet branch {branch}')
        # Only entries 2..16 of the listing are considered.
        # NOTE(review): the slice bounds are tied to the current layout of
        # the "out" folder -- confirm they still select the lecture files.
        lecture_names = [x['name'] for x in lec_data[2:17] if x['name'].endswith(".ipynb")]
        # The list is deliberately extended while being iterated: appended
        # sub-paths are visited later by the same loop (breadth-first search).
        for lecture in lecture_names:
            # get the contents of the lecture path on this branch
            url = repo_url + f"/contents/out/{lecture}?ref={branch}"
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            data = r.json()
            if 'content' not in data:  # shorthand for a filename being a directory
                for d in data:
                    lecture_names.append(lecture + '/' + d['name'])
                continue
            # The only non-directory files we are searching for are jupyter
            # notebooks. Test the file path itself: the request URL also
            # contains the repository URL and the branch name.
            if not lecture.endswith('.ipynb'):
                continue
            file_content = data['content']
            if data.get('encoding') == 'base64':
                file_content = base64.b64decode(file_content).decode()
            notebook = json.loads(file_content)

            links = []
            for cell in notebook["cells"]:
                source = cell["source"]
                if isinstance(source, str):
                    # The notebook format allows a single string as well as a
                    # list of lines; normalise so we never iterate characters.
                    source = source.splitlines()
                text = "".join(s.strip(" \r\t\n\"\'") for s in source)
                if cell["cell_type"] == 'code':  # search source code for urls used as variables
                    for match in var_patten.finditer(text):
                        links.append(Url(match.group(0)))
                for match in img_pattern.finditer(text):  # search for embedded images in every cell
                    link = match.group(1)
                    if match_patterns(link, url_patterns):
                        links.append(Url(link))

            # The lecture code is the first three characters of the path.
            res_links.setdefault(lecture[:3], set()).update(links)
    print('All links received!')
    return res_links

if __name__ == '__main__':
    # Files on the server that are known not to be images and must never be
    # reported as unused.
    ignore_list = ['https://edunet.kea.su/repo/EduNet-web_dependencies/L02/student_scores.csv',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L03/heart.csv',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L04/lfwcrop_grey.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L04/rnaseq_data.tab.txt',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L04/titanic.csv',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L04/scRNAseq_CITEseq.txt',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L04/fetal_health.csv',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L05/lc_cifar10_weights.txt',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L08/airline-passengers.csv',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L09/imagenet_class_index.json',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L09/imagen.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L09/imagenet1000_clsidx_to_labels.txt',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L10/imagenet_class_index.json',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L10/boston_dataset.csv',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L11/OSCD.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L11/OSCD2.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L11/audio_example.wav',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L11/GTdb_crop.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L11/for_transforms.Compose.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L11/EuroSAT.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L11/small_face_dataset.zip',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L12/cheetah_video.mp4',
                   'https://edunet.kea.su/repo/EduNet-web_dependencies/L13/weights_stylegan.pt']

    # Read by get_host_links, which subtracts these from every lecture's links.
    ignore_set = {Url(s) for s in ignore_list}

    # Directory URLs containing any of these substrings are not searched.
    blacklist = ['https://edunet.kea.su/repo/src']

    lec_num = ['L01', 'L02', 'L03', 'L04', 'L05', 'L06', 'L07', 'L08', 'L09', 'L10', 'L11', 'L12', 'L13', 'L14', 'L15']
    host_links = get_host_links(lec_num, blacklist)
    print("Get secret links:")
    secret_links = get_lectures_links(dev_branches, github_token, 'https://api.github.com/repos/EPC-MSU/EduNet-secret')

    print("Get lectures links:")
    lectures_links = get_lectures_links(dev_branches, github_token, 'https://api.github.com/repos/EPC-MSU/EduNet-lectures')

    # make sure that all the dictionaries have fields for every lecture
    for key in lec_num:
        lectures_links.setdefault(key, set())
        secret_links.setdefault(key, set())

    # generate a union of all host links
    host_links['all'] = set().union(*(host_links[i] for i in lec_num))

    # The report contains Cyrillic text, so the encoding is fixed explicitly
    # instead of depending on the platform default.
    with open("links_info.txt", "w", encoding="utf-8") as f:
        for i in lec_num:
            print(f'* * * Лекция {i} * * * \n', file=f)

            # Stored in this lecture's server folders but not used by the lecture.
            print('Есть на сетевом нет в лекции:\n', file=f)
            for link in sorted(host_links[i] - lectures_links[i]):
                print(f'\t{link}\n', file=f)

            # Used by the lecture but not stored anywhere on the server.
            print('\nЕсть в лекции нет на сетевом: \n', file=f)
            missing_in_lecture = lectures_links[i] - host_links['all']
            if not missing_in_lecture:
                print('Все ссылки есть на сетевом!\n', file=f)
            else:
                for link in sorted(missing_in_lecture):
                    print(f'\t{link}\n', file=f)

            # prints warnings when a lecture links to another lecture's folder
            # for j in lec_num:
            #     if i == j:
            #         continue
            #     links = sorted(list(lectures_links[i].intersection(host_links[j])))
            #     if len(links) != 0:
            #         print(f'\nНет на сетевом в папке этой лекции, но есть в папке {j}: \n', file=f)
            #     for link in links:
            #         print(f'\t{link}\n', file=f)

            # Used by the assignments but not stored in this lecture's server folders.
            print('\nЕсть в заданиях нет на сетевом: \n', file=f)
            missing_in_secret = secret_links[i] - host_links[i]
            if not missing_in_secret:
                print('Все ссылки есть на сетевом!\n', file=f)
            else:
                for link in sorted(missing_in_secret):
                    print(f'\t{link}\n', file=f)

Get host links...
Host links received!
Get secret links:
	Get branch dev-1.7
	Get branch dev-1.8
All links received!
Get lectures links:
	Get branch dev-1.7
	Get branch dev-1.8
All links received!
