Python – 第 4 页 – Pancake's Personal Website

偏最小二乘法（PLS）Python参考代码

网上并没有找到 PLS 用于多元高次回归的严格证明，但经过测试我验证了代码的可用性：一定情况下加入高次项（即可包含如 x*x 的项）后拟合的 R² 更高，但很多时候并不是如此，这只是提供一种优化的可能，只需要改 polynomial 参数即可。PLS 代码全部改自 matlab 代码，并自行加入了画图、高次项的优化和 R² 的评价函数。该代码基本已是完整代码，只有 def Polynomial(self): 函数有缺陷，如有更好想法可以一起改进。

如要搬运请注明出处，并和我联系谢谢。

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from numpy.matlib import repmat
import csv
from math import ceil


class Linear:
    """Partial least squares (PLS) regression with optional polynomial terms.

    Instantiating the class runs the whole pipeline: load the data file,
    optionally expand the features with product terms, fit the PLS model,
    write the coefficients to ``result.csv`` and plot the fit.

    Args:
        dependent: Number of dependent-variable columns; they must be the
            last columns of the data file.
        document: Path of a ``.csv``, ``.xlsx`` or ``.xls`` data file.
        polynomial: Degree of the product terms to generate (1 = none).
    """

    def __init__(self, dependent, document, polynomial=1):
        self.dependent = dependent
        self.document = document
        self.polynomial = polynomial
        self.openfile()
        # Number of independent-variable columns before any expansion.
        self.n = len(self.df.columns) - self.dependent
        self.Polynomial()
        x0, y0, num, xishu, ch0, xish, sol = self.find()
        self.save(sol=sol)
        self.PLOT(ch0=ch0, num=num, x0=x0, y0=y0, xishu=xishu, xish=xish)

    def openfile(self):
        """Load ``self.document`` into ``self.df`` based on its extension.

        Raises:
            SystemExit: If the extension is not csv/xlsx/xls.
        """
        # Compare case-insensitively so "DATA.CSV" is accepted as well.
        file_type = self.document.rsplit(".", 1)[-1].lower()
        if file_type == "csv":
            self.df = pd.read_csv(self.document, encoding='GBK')
        elif file_type in ("xlsx", "xls"):
            self.df = pd.read_excel(self.document)
        else:
            # Same effect as exit(...), without relying on the site module.
            raise SystemExit("Unknown file type")

    def Polynomial(self):
        """Append product terms of the features up to ``self.polynomial``.

        Each pass multiplies every original feature with every term created
        by the previous pass (the original features for the first pass). A
        new column is named "x" followed by the 1-based indices of its
        factors, e.g. "x12" for feature 1 times feature 2. The expanded
        frame (features, generated terms, dependent columns) replaces
        ``self.df`` and is also written to ``changed.csv``.

        Raises:
            ValueError: If a generated name equals an existing column name.
        """
        if self.polynomial == 1:
            return
        targets = self.df.iloc[:, -self.dependent:]
        features = self.df.drop(columns=targets.columns)
        n = self.n
        # With more than 10 features, concatenated indices become ambiguous
        # ("1"+"11" and "11"+"1" both give "111"), so a separator is used.
        sep = "_" if n > 10 else ""
        base = [features.iloc[:, k] for k in range(n)]
        previous = [(str(k + 1), column) for k, column in enumerate(base)]
        generated = {}
        for _ in range(1, self.polynomial):
            current = []
            for k, factor in enumerate(base):
                for suffix, term in previous:
                    label = str(k + 1) + sep + suffix
                    product = term.mul(factor)
                    generated["x" + label] = product
                    current.append((label, product))
            previous = current
        clashes = [name for name in generated if name in self.df.columns]
        if clashes:
            raise ValueError(
                "generated column names clash with existing columns: {}".format(clashes))
        # Build the frame in one concat instead of inserting columns one by one.
        self.df = pd.concat(
            [features, pd.DataFrame(generated, index=features.index), targets], axis=1)
        self.df.to_csv("changed.csv", encoding='GBK')

    def find(self):
        """Fit the PLS model on ``self.df``.

        Components are extracted one at a time; after each one a
        leave-one-out PRESS is computed and extraction stops at the first
        component whose Q_h2 falls below 0.0975. If no component falls
        below the threshold, all components are kept. The correlation
        matrix of the data is written to "相关系数矩阵.csv".

        Returns:
            Tuple ``(x0, y0, num, xishu, ch0, xish, sol)``: raw features,
            raw targets, sample count, coefficients on standardized data,
            intercepts on raw data, coefficients on raw data, and the
            intercept row stacked on top of ``xish``.

        Raises:
            SystemExit: If there are no independent-variable columns.
        """
        df = self.df
        df_matrix = np.array(df)
        mu = np.mean(df_matrix, axis=0)
        sig = np.std(df_matrix, axis=0)
        df.corr().to_csv("相关系数矩阵.csv", encoding='GBK')
        data = preprocessing.scale(df_matrix)
        m = self.dependent
        n = len(df.columns) - m
        self.n = n
        if n < 1:
            raise SystemExit("Can't find")
        x0, y0 = df_matrix[:, :n], df_matrix[:, n:]
        e0, f0 = data[:, :n], data[:, n:]
        num = df_matrix.shape[0]
        chg = np.identity(n)
        w = np.zeros([n, n])
        w_star = np.zeros([n, n])
        t = np.zeros([num, n])
        ss = []
        # Fallback: keep every component when none fails the Q_h2 test.
        r = n - 1
        for i in range(n):
            matrix = e0.T @ f0 @ f0.T @ e0
            # The matrix is symmetric, so eigh gives real eigenpairs sorted
            # by ascending eigenvalue; the last column is the dominant one.
            _, vec = np.linalg.eigh(matrix)
            w[:, i] = vec[:, -1]
            w_star[:, i] = chg @ w[:, i]
            t[:, i] = e0 @ w[:, i]
            alpha = (e0.T @ t[:, i] / (t[:, i] @ t[:, i]))[np.newaxis, :]
            chg = chg @ (np.identity(n) - w[:, i:i + 1] @ alpha)
            e0 = e0 - t[:, i:i + 1] @ alpha
            scores = t[:, :i + 1]
            # Regress on the scores plus a constant, then drop the constant row.
            beta = (np.linalg.pinv(np.c_[scores, np.ones(num)]) @ f0)[:-1]
            ss.append(((f0 - scores @ beta) ** 2).sum())
            press = 0.0
            for j in range(num):
                held_t = scores[j:j + 1, :]
                held_f = f0[j:j + 1, :]
                rest_t = np.delete(scores, j, axis=0)
                rest_f = np.delete(f0, j, axis=0)
                beta1 = (np.linalg.pinv(np.c_[rest_t, np.ones(num - 1)]) @ rest_f)[:-1]
                press += ((held_f - held_t @ beta1) ** 2).sum()
            q_h2 = 1 - press / ss[i - 1] if i > 0 else 1
            if q_h2 < 0.0975:
                print("Q_h2 = {}".format(q_h2))
                r = i
                break
        beta_z = (np.linalg.pinv(np.c_[t[:, :r + 1], np.ones(num)]) @ f0)[:-1]
        xishu = w_star[:, :r + 1] @ beta_z
        mu_x, mu_y = mu[:n], mu[n:]
        sig_x, sig_y = sig[:n], sig[n:]
        # Map the standardized coefficients back to the raw variables.
        ch0 = [float(mu_y[i] - (mu_x / sig_x * sig_y[i]) @ xishu[:, i]) for i in range(m)]
        xish = xishu / sig_x[:, np.newaxis] * sig_y[np.newaxis, :]
        sol = np.r_[np.array([ch0]), xish]
        return x0, y0, num, xishu, ch0, xish, sol

    def PLOT(self, ch0, num, x0, y0, xishu, xish):
        """Print R^2 per target and plot predictions and coefficients.

        For every dependent variable the left subplot shows predicted
        against observed values with a diagonal reference line, the right
        subplot shows the standardized coefficients as bars. The figure is
        saved as ``verify.jpg`` and then shown.

        Args:
            ch0: Intercepts, one per dependent variable.
            num: Sample count (kept for interface compatibility; broadcasting
                makes it unnecessary here).
            x0: Raw feature matrix.
            y0: Raw target matrix.
            xishu: Coefficients on standardized data.
            xish: Coefficients on raw data.
        """
        plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
        plt.rcParams['axes.unicode_minus'] = False
        y_hat = np.asarray(ch0) + x0 @ xish
        ymax = np.maximum(y_hat.max(axis=0), y0.max(axis=0))
        for i in range(self.dependent):
            print("y{}: R^2 score = {}".format(i + 1, R2_func(y_hat[:, i], y0[:, i])))
            plt.subplot(self.dependent, 2, i * 2 + 1)
            diagonal = [-1, ceil(ymax[i])]
            plt.plot(diagonal, diagonal, '-')
            plt.plot(y_hat[:, i], y0[:, i], '*')
            plt.title("y{}".format(i + 1))
            plt.subplot(self.dependent, 2, i * 2 + 2)
            plt.bar(np.arange(self.n), height=xishu[:, i], width=0.5)
            plt.plot([0, self.n], [0, 0], "-")
            plt.title("y{}".format(i + 1))
        plt.tight_layout()
        plt.savefig("verify.jpg")
        plt.show()

    def save(self, sol):
        """Print the coefficient table and write it to ``result.csv``.

        The table has a header row ("dependent", "y1", ...), an intercept
        row labelled "x0", and one row per feature column of ``self.df``.

        Args:
            sol: Intercept row stacked on the raw-data coefficients.
        """
        header = [["y{}".format(i + 1) for i in range(self.dependent)]]
        table = np.r_[header, sol]
        labels = ["dependent", "x0"] + list(self.df.columns[:-self.dependent])
        table = np.c_[labels, table]
        print(table)
        with open("result.csv", "w", newline="") as file:
            csv.writer(file).writerows(table)


def R2_func(y_test, y):
    """Return the coefficient of determination of ``y_test`` against ``y``.

    Args:
        y_test: Predicted values.
        y: Observed values; must support ``.mean()`` and element-wise math.

    Returns:
        One minus the ratio of the residual sum of squares to the total
        sum of squares of ``y`` around its mean.
    """
    residual_ss = ((y_test - y) ** 2).sum()
    total_ss = ((y.mean() - y) ** 2).sum()
    return 1 - residual_ss / total_ss


if __name__ == '__main__':
    # Arguments: number of dependent variables, data file path, polynomial degree.
    # File format: the first row holds column names (not data); no column may be
    # all zeros, so remove such columns beforehand.
    # File format: the first column must not be an index; the independent-variable
    # columns come first, followed by the dependent-variable columns.
    # Supported file types: .csv, .xlsx, .xls
    Linear(dependent=1, document="PLS2.csv", polynomial=1)

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

from sklearn import preprocessing

from numpy.matlib import repmat

import csv

from math import ceil

class Linear:

def __init__(self, dependent, document, polynomial=1):

self.dependent = dependent

self.document = document

self.polynomial = polynomial

self.openfile()

self.n = len(self.df.columns) - self.dependent

self.Polynomial()

x0, y0, num, xishu, ch0, xish, sol = self.find()

self.save(sol=sol)

self.PLOT(ch0=ch0, num=num, x0=x0, y0=y0, xishu=xishu, xish=xish)

def openfile(self):

file_type = self.document.split(".")[-1]

if file_type == "csv":

self.df = pd.read_csv(self.document, encoding='GBK')

elif file_type == "xlsx" or file_type == "xls":

self.df = pd.read_excel(self.document)

else:

exit("Unknown file type")

def Polynomial(self):

if self.polynomial != 1:

temp = self.df.iloc[:, -self.dependent:]

self.df.drop(self.df.columns[-self.dependent:], axis=1, inplace=True)

count = self.n

count_begin = 0

for i in range(1, self.polynomial):

count_end = count

for k in range(self.n):

for j in range(count_begin, count_end):

name = "x" + str(k + 1) + str(j + 1) if i == 1 else "x" + str(k + 1) + self.df.columns[j][1:]

count += 1

self.df[name] = self.df.iloc[:, j].mul(self.df.iloc[:, k])

count_begin = count_end

for i in range(len(temp.columns)):

self.df[temp.columns[i]] = temp.iloc[:, i]

self.df.to_csv("changed.csv", encoding='GBK')

def find(self):

df = self.df

df_matrix = np.array(df)

mu = np.mean(df_matrix, axis=0)

sig = np.std(df_matrix, axis=0)

rr = df.corr()

rr.to_csv("相关系数矩阵.csv", encoding='GBK')

data = preprocessing.scale(df_matrix)

m = self.dependent

n = len(df.columns) - m

self.n = n

x0 = df_matrix[:, :n]

y0 = df_matrix[:, n:]

e0 = data[:, :n]

f0 = data[:, n:]

num = len(df.iloc[:, 0])

chg = np.identity(n)

w = np.zeros([n, n])

w_star = np.zeros([n, n])

t = np.zeros([num, n])

ss = []

Q_h2 = []

press_i = [0 for i in range(num)]

press = [0 for i in range(n)]

flag = 0

for i in range(n):

matrix = e0.T @ f0 @ f0.T @ e0

[val, vec] = np.linalg.eig(matrix)

val = val.argsort()

w[:, i] = vec[:, val[len(val) - 1]]

w_star[:, i] = chg @ w[:, i]

t[:, i] = e0 @ w[:, i]

alpha = [e0.T @ t[:, i] / (t[:, i].T @ t[:, i])]

chg = chg @ (np.identity(n) - w[:, i:i + 1] @ alpha)

e0 = e0 - t[:, i:i + 1] @ alpha

beta = np.linalg.pinv(np.c_[t[:, :i + 1], np.ones(num)]) @ f0

beta = np.delete(beta, (-1), axis=0)

cancha = f0 - t[:, :i + 1] @ beta

cancha = np.array([[cancha[i][j] ** 2 for j in range(len(cancha[i]))] for i in range(len(cancha))])

ss.append(cancha.sum())

for j in range(num):

t1 = t[:, :i + 1]

f1 = f0

she_t = t1[j:j + 1, :]

she_f = f1[j:j + 1, :]

t1 = np.delete(t1, j, axis=0)

f1 = np.delete(f1, j, axis=0)

beta1 = np.linalg.pinv(np.c_[t1, np.ones(num - 1)]) @ f1

beta1 = np.delete(beta1, (-1), axis=0)

cancha = she_f - she_t @ beta1

cancha = np.array([[cancha[i][j] ** 2 for j in range(len(cancha[i]))] for i in range(len(cancha))])

press_i[j] = cancha.sum()

press[i] = np.array(press_i).sum()

if i > 0:

Q_h2.append(1 - press[i] / ss[i - 1])

# print('Q_h2[{}] = {}'.format(i, (1 - press[i] / ss[i - 1])))

else:

Q_h2.append(1)

if Q_h2[i] < 0.0975:

# print('Number of components proposedr = %d' % (i + 1))

print("Q_h2 = {}".format(Q_h2[-1]))

r = i

flag = 1

break

if not flag:

exit("Can't find")

beta_z = np.linalg.pinv(np.c_[t[:, :r + 1], np.ones(num)]) @ f0

beta_z = np.delete(beta_z, (-1), axis=0)

xishu = w_star[:, :r + 1] @ beta_z

mu_x = mu[:n]

mu_y = mu[n:]

sig_x = sig[:n]

sig_y = sig[n:]

ch0 = []

for i in range(m):

ch0.append(float(mu_y[i] - np.true_divide(mu_x, sig_x) * sig_y[i] @ xishu[:, i:i + 1]))

xish = np.zeros([n, m])

for i in range(m):

xish[:, i] = np.true_divide(xishu[:, i], sig_x.T) * sig_y[i]

sol = np.r_[np.array([ch0]), xish]

# 防止报错 nan_2_0

# x0, y0, num, xishu, ch0, xish, sol = map(lambda x: np.nan_to_num(x), [x0, y0, num, xishu, ch0, xish, sol])

return x0, y0, num, xishu, ch0, xish, sol

def PLOT(self, ch0, num, x0, y0, xishu, xish):

plt.rcParams['font.sans-serif'] = ['SimHei'] # 用黑体显示中文

plt.rcParams['axes.unicode_minus'] = False

ch0 = repmat(ch0, num, 1)

y_hat = ch0 + x0 @ xish

y1max = y_hat.max(axis=0)

y2max = y0.max(axis=0)

ymax = np.r_[np.array([y1max]), np.array([y2max])].max(axis=0)

for i in range(self.dependent):

print("y{}: R^2 score = {}".format(i + 1, R2_func(y_hat[:, i], self.df.iloc[:, -self.dependent + i])))

plt.subplot(self.dependent, 2, i * 2 + 1)

x = [-1, ceil(ymax[i])]

plt.plot(x, x, '-')

plt.plot(y_hat[:, i], y0[:, i], '*')

plt.title("y{}".format(i + 1))

plt.subplot(self.dependent, 2, i * 2 + 2)

x = np.arange(self.n)

plt.bar(x, height=xishu[:, i].reshape([1, self.n], order='F')[0], width=0.5)

plt.plot([0, self.n], [0, 0], "-")

plt.title("y{}".format(i + 1))

plt.tight_layout()

plt.savefig("verify.jpg")

plt.show()

def save(self, sol):

sol = np.r_[[["y{}".format(i + 1) for i in range(self.dependent)]], sol]

sol = np.c_[["dependent", "x0"] + list(self.df.columns[:-self.dependent]), sol]

print(sol)

with open("result.csv", "w", newline="") as file:

writer = csv.writer(file)

writer.writerows(sol)

def R2_func(y_test, y):

return 1 - ((y_test - y) ** 2).sum() / ((y.mean() - y) ** 2).sum()

if __name__ == '__main__':

# 因变量个数，文件位置，次数

# 文件格式说明：第一行为数据名称（非数据），每一列不能全为0，请自行删除全为0的列

# 文件格式说明：第一列拒绝 index ，前面 m 列为自变量，后面 n 列为因变量

# 文件类型说明：支持 .csv .xlsx .xls

Linear(1, "PLS2.csv", 1)

天气预测源代码

该代码为原创代码，版权归杭州电子科技大学-蹲pycharm的小火龙队，成员：陈志龙、潘一壬、金日翔 2021.5

如果需要请自取并自行改编
全部资料代码下载

其中 Linear.py 为偏最小二乘法的 Python 代码实现，想做别的偏最小二乘法项目可以进行参考，代码的具体理解及注释可以参考书籍。

题目

城市天气分析

【背景说明】
我国幅员辽阔，跨纬度较广，距海远近差距较大，加之地势高低不同，地形类型及山脉走向多样，因而气温降水的组合多种多样，形成了多种多样的气候。气温，降水，温差，霜冻，风力等等对于人们的生产生活至关重要，天气对我们的影响是多方面的，比如农业生产，南方水稻北方小麦为主，受温度和降水影响；工业生产，风向对于大气污染会有影响，对于工厂选址也有影响，天气对生活的影响不言而喻。
【问题说明】
（1）要求参赛队员通过 http://www.tianqiapi.com/index/doc?version=history 提供的接口，使用爬虫爬取浙江省 11 个城市 2021 年 3 月份天气数据。
（2）对爬取数据进行处理，分别提取出各城市每天最高最低温，绘制其最高温最低温折线图，根据 31 组数据使用线性回归预测 4 月 1 日至 7 日最高温，最低温，最后取这 11所城市的预测结果平均值作为浙江省 4 月 1 日至 7 日最高温最低温预测值。
【提交标准】
提交文件为：
1. 相关数据分析报告（pdf/pdf 各一份）
2. 相关代码文档（包括爬虫代码文档和分析代码文档，压缩包形式上交）
3. 爬取资料文件（压缩包形式上交）
备注：
爬取资料文件如过大，可上交部分资料文件并说明
分析代码建议以 Jupyter Notebook ipynb 文件格式上交

import numpy as np
from openpyxl import Workbook
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from numpy.matlib import repmat
import requests
import time
import json
from Spider import Spider
from Spider_april import Spider_april
from Linear import Linear
from Find_average import Find_average
from Predict import Predict
from Plot import Plot
from Plot_april import Plot_april

# Run every stage of the project once, in order; each stage does its work
# when instantiated/called and its return value is not needed.
for stage in (Spider, Spider_april, Plot, Linear, Predict, Find_average, Plot_april):
    stage()

import numpy as np

from openpyxl import Workbook

import matplotlib.pyplot as plt

import pandas as pd

from sklearn import preprocessing

from numpy.matlib import repmat

import requests

import time

import json

from Spider import Spider

from Spider_april import Spider_april

from Linear import Linear

from Find_average import Find_average

from Predict import Predict

from Plot import Plot

from Plot_april import Plot_april

Spider()

Spider_april()

Plot()

Linear()

Predict()

Find_average()

Plot_april()

import requests
from openpyxl import Workbook
import time
import pandas as pd
import json


class Spider():
    """Download one month of weather history for a list of cities.

    Instantiating the class runs the whole job: make sure ``Weather.xlsx``
    exists, fetch every city in ``self.city``, merge the rows into
    ``Weather.xlsx`` and write a numeric version of the data to
    ``Weather_secondary_treatment.xlsx``.
    """

    def __init__(self):
        # Header sent with every request. The key must be 'User-Agent';
        # 'user_agent' would be sent as an unrelated header.
        self.headers = {
            'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'}
        # Query parameters required by the API. Both credentials are
        # placeholders and must be replaced with real values.
        self.appid = 'xxxxx'
        self.appsecret = 'xxxxx'
        self.version = 'history'
        self.year = 2021
        self.month = 3
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        # One URL per city, in the same order as self.city.
        self.urls = [
            'https://tianqiapi.com/api?version={}&appid={}&appsecret={}&city={}&year={}&month={}'.format(
                self.version, self.appid, self.appsecret, name, self.year, self.month)
            for name in self.city]
        self.if_file()
        # Any list of cities taken from self.city may be passed here.
        self.result = self.spider(self.city)
        self.save_file(self.result)
        self.secondary_treatment()

    def if_file(self):
        """Create ``Weather.xlsx`` if it cannot be opened."""
        try:
            with open('Weather.xlsx'):
                pass
        except IOError:
            self.creat_Excel()

    def creat_Excel(self):
        """Write a new ``Weather.xlsx`` that contains only the header row."""
        wb = Workbook()
        sheet = wb.active
        sheet.title = 'Weather'
        row = ['City', 'Date', 'Highest', 'Lowest', '天气情况', '风向', '风力', '空气质量', '空气质量描述', '空气质量等级']
        sheet.append(row)
        wb.save('Weather.xlsx')

    def spider(self, city_name):
        """Fetch the monthly data for every city in ``city_name``.

        A request that does not return HTTP 200 is retried once per second,
        at most 10 times per city and 50 times overall. A city that still
        fails is skipped; exceeding the overall limit stops the program.

        Args:
            city_name: Iterable of city names; each must be in ``self.city``.

        Returns:
            A list of rows ``[city, date, highest, lowest, weather,
            wind direction, wind force, aqi, aqi info, aqi level]``.

        Raises:
            SystemExit: For an unknown city or after 50 failed retries.
        """
        fields = ('ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli',
                  'aqi', 'aqiInfo', 'aqiLevel')
        result = []
        # Raw responses of every city, printed at the end for inspection.
        result_sum = []
        total_retries = 0
        for city in city_name:
            if city not in self.city:
                print('City {} not find in Zhejiang province'.format(city))
                raise SystemExit
            print('Now spide {} city'.format(city))
            url = self.urls[self.city.index(city)]
            res = requests.get(url=url, headers=self.headers)
            city_retries = 0
            gave_up = False
            while res.status_code != 200:
                if city_retries >= 10:
                    gave_up = True
                    break
                if total_retries >= 50:
                    print('Please check your Internet')
                    raise SystemExit
                city_retries += 1
                total_retries += 1
                print('Error, the {} attemp in {} city, together {} attemp'.format(
                    city_retries, city, total_retries))
                time.sleep(1)
                res = requests.get(url=url, headers=self.headers)
            if gave_up:
                print('City {} can\'t find, go to next city'.format(city))
                result_sum.append({'city': city, 'status': None})
                continue
            print('City {} success'.format(city))
            payload = json.loads(res.text)
            result_sum.append(payload)
            # Take every day the API returned instead of assuming 31 days.
            for day in payload['data']:
                result.append([city] + [day[field] for field in fields])
            # Pause between cities (presumably to respect an API rate limit).
            time.sleep(5.01)
        print(result_sum)
        return result

    def save_file(self, result):
        """Append ``result`` rows to ``Weather.xlsx``, deduplicate and sort."""
        df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')
        for row in result:
            df.loc[len(df.index)] = row
        df = self.delete_same(df)
        df.sort_values(['City', 'Date'], ascending=True, inplace=True)
        df.to_excel('Weather.xlsx', index=False, sheet_name='Weather')

    def sort_values(self):
        """Sort ``Weather.xlsx`` by city and date (can be called on its own)."""
        df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')
        df.sort_values(['City', 'Date'], ascending=True, inplace=True)
        df.to_excel('Weather.xlsx', index=False, sheet_name='Weather')

    def delete_same(self, df):
        """Return ``df`` without rows repeating an earlier City/Date/Highest.

        The first occurrence of each combination is kept and the number of
        removed rows is printed.
        """
        deduped = df.drop_duplicates(subset=['City', 'Date', 'Highest'], keep='first')
        removed = len(df.index) - len(deduped.index)
        print('共去除了{}调重复项目'.format(removed))
        return deduped

    def secondary_treatment(self):
        """Convert ``Weather.xlsx`` to numbers and save the result.

        Temperatures lose their last character (assumed to be the unit
        sign) and become integers. Wind force labels are replaced through
        the ``fengli_change`` table. The weather description becomes the
        mean position, in ``tianqi_situation``, of the keywords it
        contains, or ``None`` when it contains none of them. The output is
        written to ``Weather_secondary_treatment.xlsx``. Can be called on
        its own.

        Raises:
            ValueError: If a wind force label is not in the lookup table.
        """
        self.if_file_2()
        fengli = ['微'] + ['%d级' % (i + 1) for i in range(17)]
        # Numeric value per wind force label (presumably a representative
        # wind speed for each level; verify against the data source).
        fengli_change = [0.1, 0.85, 2.4, 4.35, 6.65, 9.3, 12.25, 15.45, 18.9, 22.55, 26.4, 30.5, 34.75, 39.15, 43.75,
                         48.5, 53.45, None]
        wind_value = dict(zip(fengli, fengli_change))
        tianqi_situation = ['晴', '多云', '阴', '雾', '小雨', '中雨', '大雨', '暴雨']
        df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')
        highest = [int(text[:-1]) for text in df['Highest']]
        lowest = [int(text[:-1]) for text in df['Lowest']]
        wind = []
        for label in df['风力']:
            if label not in wind_value:
                raise ValueError('{!r} is not in list'.format(label))
            wind.append(wind_value[label])
        weather = []
        for text in df['天气情况']:
            hits = [rank for rank, word in enumerate(tianqi_situation) if word in text]
            # No known keyword: store a missing value instead of dividing by zero.
            weather.append(sum(hits) / len(hits) if hits else None)
        df['风力'] = wind
        df['天气情况'] = weather
        df['Highest'] = highest
        df['Lowest'] = lowest
        df.to_excel('Weather_secondary_treatment.xlsx', index=False, sheet_name='Weather_secondary_treatment')

    def if_file_2(self):
        """Create ``Weather_secondary_treatment.xlsx`` if it cannot be opened."""
        try:
            with open('Weather_secondary_treatment.xlsx'):
                pass
        except IOError:
            wb = Workbook()
            sheet = wb.active
            sheet.title = 'Weather_secondary_treatment'
            row = ['City', 'Date', 'Highest', 'Lowest', '天气情况', '风向', '风力', '空气质量', '空气质量描述', '空气质量等级']
            sheet.append(row)
            wb.save('Weather_secondary_treatment.xlsx')


# Run the full crawl only when this file is executed as a script.
if __name__ == '__main__':
    Spider()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

import requests

from openpyxl import Workbook

import time

import pandas as pd

import json

class Spider():

def __init__(self):

# 全局header

self.headers = {

'user_agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'}

# 网址必要后缀

self.appid = xxxxx

self.appsecret = 'xxxxx'

self.version = 'history'

self.year = 2021

self.month = 3

self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')

# 全部需要爬取的网址

self.urls = [

'https://tianqiapi.com/api?version={}&appid={}&appsecret={}&city={}&year={}&month={}'.format(self.version,

self.appid,

self.appsecret,

i, self.year,

self.month) for

i in self.city]

self.if_file()

# city可输入为包含任意城市的数组

self.result = self.spider(self.city)

self.save_file(self.result)

self.secondary_treatment()

# 判断是否已经存在文件，若无则新建程序

def if_file(self):

try:

f = open('Weather.xlsx')

f.close()

except IOError:

self.creat_Excel()

# 新建文件

def creat_Excel(self):

wb = Workbook()

sheet = wb.active

sheet.title = 'Weather'

row = ['City', 'Date', 'Highest', 'Lowest', '天气情况', '风向', '风力', '空气质量', '空气质量描述', '空气质量等级']

sheet.append(row)

wb.save('Weather.xlsx')

# 可输入城市（数组）名称进行爬取

def spider(self, city_name):

# 最高温和最低温结果保存

result = []

# 全部数据保存以便后续使用

result_sum = []

sum = 1

for i in city_name:

# 异常处理代码

if not i in self.city:

print('City {} not find in Zhejiang province'.format(i))

exit()

else:

print('Now spide {} city'.format(i))

# 爬取

res = requests.get(url=self.urls[self.city.index(i)], headers=self.headers)

# 错误处理代码：若爬取失败每1s爬取一次,每个城市最多再10次重新爬取，一共最多再50次重新爬取

while res.status_code != 200:

j = 1

print('Error, the {} attemp in {} city, together {} attemp'.format(j, i, sum))

j += 1

sum += 1

res = requests.get(url=self.urls[i], headers=self.headers)

time.sleep(1)

if j > 10:

print('City {} can\'t find, go to next city'.format(i))

result_sum.append({'city': i, 'status': None})

continue

if sum > 50:

print('Please check your Internet')

exit()

print('City {} success'.format(i))

# 数据处理

res = json.loads(res.text)

result_sum.append(res)

for k in range(31):

result.append([i, res['data'][k]['ymd'], res['data'][k]['bWendu'], res['data'][k]['yWendu'],

res['data'][k]['tianqi'], res['data'][k]['fengxiang'], res['data'][k]['fengli'],

res['data'][k]['aqi'], res['data'][k]['aqiInfo'], res['data'][k]['aqiLevel']])

time.sleep(5.01)

print(result_sum)

return result

# 保存爬取文件

def save_file(self, result):

df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')

for i in result:

df.loc[len(list(df.index))] = i

# 去重

df = self.delete_same(df)

# 排序

df.sort_values(['City', 'Date'], ascending=True, inplace=True)

# 保存

df.to_excel('Weather.xlsx', index=False, sheet_name='Weather')

# 可单独调用排序

def sort_values(self):

df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')

df.sort_values(['City', 'Date'], ascending=True, inplace=True)

df.to_excel('Weather.xlsx', index=False, sheet_name='Weather')

# 去重代码

def delete_same(self, df):

sum = 0

for i in range(len(list(df.index)) - 1):

for j in range(i + 1, len(list(df.index))):

for k in ['City', 'Date', 'Highest']:

if df[k][i] != df[k][j]:

break

if k == 'Highest':

df.drop(j, inplace=True)

sum += 1

print('共去除了{}调重复项目'.format(sum))

return df

# 数据二次处理代码（可单独调用）

def secondary_treatment(self):

self.if_file_2()

fengli = ['微']

fengli += ['%d级' % (i + 1) for i in range(17)]

fengli_change = [0.1, 0.85, 2.4, 4.35, 6.65, 9.3, 12.25, 15.45, 18.9, 22.55, 26.4, 30.5, 34.75, 39.15, 43.75,

48.5, 53.45, None]

tianqi_situation = ['晴', '多云', '阴', '雾', '小雨', '中雨', '大雨', '暴雨']

df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')

fengli_col = list(df['风力'])

tianqi_situation_col = list(df['天气情况'])

Highest = list(df['Highest'])

Lowest = list(df['Lowest'])

for i in range(31 * 11):

Highest[i] = int(Highest[i][:-1])

Lowest[i] = int(Lowest[i][:-1])

fengli_col[i] = fengli_change[fengli.index(df['风力'][i])]

tianqi_situation_number = 0

tianqi_situation_sum = 0

for j in range(8):

if tianqi_situation[j] in df['天气情况'][i]:

tianqi_situation_sum += j

tianqi_situation_number += 1

tianqi_situation_col[i] = tianqi_situation_sum / tianqi_situation_number

df['风力'] = fengli_col

df['天气情况'] = tianqi_situation_col

df['Highest'] = Highest

df['Lowest'] = Lowest

df.to_excel('Weather_secondary_treatment.xlsx', index=False, sheet_name='Weather_secondary_treatment')

# 判断是否已经存在文件，若无则新建程序

def if_file_2(self):

try:

f = open('Weather_secondary_treatment.xlsx')

f.close()

except IOError:

wb = Workbook()

sheet = wb.active

sheet.title = 'Weather_secondary_treatment'

row = ['City', 'Date', 'Highest', 'Lowest', '天气情况', '风向', '风力', '空气质量', '空气质量描述', '空气质量等级']

sheet.append(row)

wb.save('Weather_secondary_treatment.xlsx')

if __name__ == '__main__':

Spider()

import requests
from openpyxl import Workbook
import time
import pandas as pd
import json

class Spider_april():
    """Scrape, store and post-process April 2021 weather data for Zhejiang cities.

    Constructing an instance runs the whole pipeline: make sure the raw
    workbook exists, request the history data of every city, merge the first
    days of each response into the raw workbook (de-duplicated and sorted),
    then write a numeric copy of the data to a second workbook.
    """

    # Workbook locations, kept exactly as the script has always written them.
    RAW_FILE = '.\\4月1-7日\\Weather.xlsx'
    TREATED_FILE = '.\\4月1-7日\\Weather_secondary_treatment.xlsx'
    # Header row shared by both workbooks.
    HEADER_ROW = ['City', 'Date', 'Highest', 'Lowest', '天气情况', '风向', '风力', '空气质量', '空气质量描述', '空气质量等级']
    # Number of leading daily records taken from each city's response.
    DAYS_PER_CITY = 7
    # Retry budget after a non-200 response: per city and for the whole run.
    MAX_RETRIES_PER_CITY = 10
    MAX_RETRIES_TOTAL = 50

    def __init__(self):
        """Configure the API request and immediately run the full pipeline."""
        # Global request headers. The key must be the real HTTP header name
        # 'User-Agent'; the former 'user_agent' key was sent as an unrelated header.
        self.headers = {
            'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'}
        # Mandatory query-string values. 'xxxxx' / 'xxxx' are placeholders to be
        # replaced with real credentials (the bare name xxxxx raised NameError).
        self.appid = 'xxxxx'
        self.appsecret = 'xxxx'
        self.version = 'history'
        self.year = 2021
        self.month = 4
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        # One request URL per city, in the same order as self.city.
        self.urls = [
            'https://tianqiapi.com/api?version={}&appid={}&appsecret={}&city={}&year={}&month={}'.format(
                self.version, self.appid, self.appsecret, city, self.year, self.month)
            for city in self.city]
        self.if_file()
        # Any list of cities taken from self.city may be passed to spider().
        self.result = self.spider(self.city)
        self.save_file(self.result)
        self.secondary_treatment()

    @staticmethod
    def _can_open(path):
        """Return True when ``path`` can be opened for reading."""
        try:
            with open(path):
                return True
        except IOError:
            return False

    def _create_workbook(self, path, title):
        """Write a new workbook at ``path`` holding only the header row on sheet ``title``."""
        wb = Workbook()
        sheet = wb.active
        sheet.title = title
        sheet.append(list(self.HEADER_ROW))
        wb.save(path)

    def if_file(self):
        """Create the raw workbook if it does not exist yet."""
        if not self._can_open(self.RAW_FILE):
            self.creat_Excel()

    def creat_Excel(self):
        """Create the raw workbook with sheet 'Weather' and the header row."""
        self._create_workbook(self.RAW_FILE, 'Weather')

    def spider(self, city_name):
        """Download the daily records of every city in ``city_name``.

        A non-200 response is retried once per second, at most
        MAX_RETRIES_PER_CITY times for one city and MAX_RETRIES_TOTAL times
        over the whole run. A city that keeps failing is skipped; exhausting
        the global budget ends the program.

        :param city_name: iterable of city names, each of which must be in self.city.
        :return: list of rows ``[city, ymd, bWendu, yWendu, tianqi, fengxiang,
                 fengli, aqi, aqiInfo, aqiLevel]``, up to DAYS_PER_CITY rows per city.
        :raises SystemExit: for an unknown city or when the global retry budget is spent.
        """
        result = []
        # Every decoded response (or a failure marker) is kept and printed at the end.
        result_sum = []
        total_retries = 0
        for city in city_name:
            if city not in self.city:
                print('City {} not find in Zhejiang province'.format(city))
                raise SystemExit()
            print('Now spide {} city'.format(city))
            url = self.urls[self.city.index(city)]
            res = requests.get(url=url, headers=self.headers)
            # The per-city counter lives outside the loop so the limit can actually be reached.
            city_retries = 0
            while res.status_code != 200 and city_retries < self.MAX_RETRIES_PER_CITY:
                if total_retries >= self.MAX_RETRIES_TOTAL:
                    print('Please check your Internet')
                    raise SystemExit()
                city_retries += 1
                total_retries += 1
                print('Error, the {} attemp in {} city, together {} attemp'.format(city_retries, city, total_retries))
                time.sleep(1)
                res = requests.get(url=url, headers=self.headers)
            if res.status_code != 200:
                print('City {} can\'t find, go to next city'.format(city))
                result_sum.append({'city': city, 'status': None})
                continue
            print('City {} success'.format(city))
            payload = json.loads(res.text)
            result_sum.append(payload)
            # Slicing tolerates a response that holds fewer than DAYS_PER_CITY days.
            for day in payload['data'][:self.DAYS_PER_CITY]:
                result.append([city, day['ymd'], day['bWendu'], day['yWendu'],
                               day['tianqi'], day['fengxiang'], day['fengli'],
                               day['aqi'], day['aqiInfo'], day['aqiLevel']])
            # Pause between cities before the next request.
            time.sleep(5.01)
        print(result_sum)
        return result

    def save_file(self, result):
        """Append ``result`` rows to the raw workbook, de-duplicate, sort and save it."""
        df = pd.read_excel(self.RAW_FILE, sheet_name='Weather', engine='openpyxl')
        for row in result:
            df.loc[len(df.index)] = row
        df = self.delete_same(df)
        df.sort_values(['City', 'Date'], ascending=True, inplace=True)
        df.to_excel(self.RAW_FILE, index=False, sheet_name='Weather')

    def sort_values(self):
        """Sort the raw workbook by City and Date (can be called on its own)."""
        df = pd.read_excel(self.RAW_FILE, sheet_name='Weather', engine='openpyxl')
        df.sort_values(['City', 'Date'], ascending=True, inplace=True)
        df.to_excel(self.RAW_FILE, index=False, sheet_name='Weather')

    def delete_same(self, df):
        """Remove duplicated rows from ``df`` in place and return it.

        Rows are duplicates when 'City', 'Date' and 'Highest' are all equal;
        the first occurrence is kept. The previous pairwise loop kept indexing
        labels it had already dropped (KeyError) and also removed rows whose
        'Highest' value differed.
        """
        rows_before = len(df.index)
        df.drop_duplicates(subset=['City', 'Date', 'Highest'], keep='first', inplace=True)
        removed = rows_before - len(df.index)
        print('共去除了{}调重复项目'.format(removed))
        return df

    def secondary_treatment(self):
        """Write a numeric copy of the raw workbook (can be called on its own).

        'Highest' / 'Lowest' lose their last character and are parsed as int,
        '风力' labels are replaced by the matching ``fengli_change`` entry, and
        '天气情况' becomes the mean index of the ``tianqi_situation`` keywords
        found in the description (None when none is found). Every row is
        processed; the row count is no longer fixed at 7 * 11.

        :raises ValueError: if a wind label is not in the known label list.
        """
        self.if_file_2()
        fengli = ['微'] + ['%d级' % (i + 1) for i in range(17)]
        # One numeric value per wind label above (presumably a wind speed — confirm units).
        fengli_change = [0.1, 0.85, 2.4, 4.35, 6.65, 9.3, 12.25, 15.45, 18.9, 22.55, 26.4, 30.5, 34.75, 39.15, 43.75,
                         48.5, 53.45, None]
        tianqi_situation = ['晴', '多云', '阴', '雾', '小雨', '中雨', '大雨', '暴雨']
        df = pd.read_excel(self.RAW_FILE, sheet_name='Weather', engine='openpyxl')

        highest, lowest, wind_values, weather_codes = [], [], [], []
        for high, low, wind_label, description in zip(df['Highest'], df['Lowest'], df['风力'], df['天气情况']):
            highest.append(int(high[:-1]))
            lowest.append(int(low[:-1]))
            wind_values.append(fengli_change[fengli.index(wind_label)])
            matched = [code for code, keyword in enumerate(tianqi_situation) if keyword in description]
            # A description without any known keyword used to raise ZeroDivisionError.
            weather_codes.append(sum(matched) / len(matched) if matched else None)

        df['风力'] = wind_values
        df['天气情况'] = weather_codes
        df['Highest'] = highest
        df['Lowest'] = lowest
        df.to_excel(self.TREATED_FILE, index=False, sheet_name='Weather_secondary_treatment')

    def if_file_2(self):
        """Create the processed workbook (header row only) if it does not exist yet."""
        if not self._can_open(self.TREATED_FILE):
            self._create_workbook(self.TREATED_FILE, 'Weather_secondary_treatment')


# Run the scrape-and-process pipeline only when this file is executed as a script.
if __name__ == "__main__":
    Spider_april()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

import requests

from openpyxl import Workbook

import time

import pandas as pd

import json

class Spider_april():
    """Scrape, store and post-process April 2021 weather data for Zhejiang cities.

    Constructing an instance runs the whole pipeline: make sure the raw
    workbook exists, request the history data of every city, merge the first
    days of each response into the raw workbook (de-duplicated and sorted),
    then write a numeric copy of the data to a second workbook.
    """

    # Workbook locations, kept exactly as the script has always written them.
    RAW_FILE = '.\\4月1-7日\\Weather.xlsx'
    TREATED_FILE = '.\\4月1-7日\\Weather_secondary_treatment.xlsx'
    # Header row shared by both workbooks.
    HEADER_ROW = ['City', 'Date', 'Highest', 'Lowest', '天气情况', '风向', '风力', '空气质量', '空气质量描述', '空气质量等级']
    # Number of leading daily records taken from each city's response.
    DAYS_PER_CITY = 7
    # Retry budget after a non-200 response: per city and for the whole run.
    MAX_RETRIES_PER_CITY = 10
    MAX_RETRIES_TOTAL = 50

    def __init__(self):
        """Configure the API request and immediately run the full pipeline."""
        # Global request headers. The key must be the real HTTP header name
        # 'User-Agent'; the former 'user_agent' key was sent as an unrelated header.
        self.headers = {
            'User-Agent': 'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1'}
        # Mandatory query-string values. 'xxxxx' / 'xxxx' are placeholders to be
        # replaced with real credentials (the bare name xxxxx raised NameError).
        self.appid = 'xxxxx'
        self.appsecret = 'xxxx'
        self.version = 'history'
        self.year = 2021
        self.month = 4
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        # One request URL per city, in the same order as self.city.
        self.urls = [
            'https://tianqiapi.com/api?version={}&appid={}&appsecret={}&city={}&year={}&month={}'.format(
                self.version, self.appid, self.appsecret, city, self.year, self.month)
            for city in self.city]
        self.if_file()
        # Any list of cities taken from self.city may be passed to spider().
        self.result = self.spider(self.city)
        self.save_file(self.result)
        self.secondary_treatment()

    @staticmethod
    def _can_open(path):
        """Return True when ``path`` can be opened for reading."""
        try:
            with open(path):
                return True
        except IOError:
            return False

    def _create_workbook(self, path, title):
        """Write a new workbook at ``path`` holding only the header row on sheet ``title``."""
        wb = Workbook()
        sheet = wb.active
        sheet.title = title
        sheet.append(list(self.HEADER_ROW))
        wb.save(path)

    def if_file(self):
        """Create the raw workbook if it does not exist yet."""
        if not self._can_open(self.RAW_FILE):
            self.creat_Excel()

    def creat_Excel(self):
        """Create the raw workbook with sheet 'Weather' and the header row."""
        self._create_workbook(self.RAW_FILE, 'Weather')

    def spider(self, city_name):
        """Download the daily records of every city in ``city_name``.

        A non-200 response is retried once per second, at most
        MAX_RETRIES_PER_CITY times for one city and MAX_RETRIES_TOTAL times
        over the whole run. A city that keeps failing is skipped; exhausting
        the global budget ends the program.

        :param city_name: iterable of city names, each of which must be in self.city.
        :return: list of rows ``[city, ymd, bWendu, yWendu, tianqi, fengxiang,
                 fengli, aqi, aqiInfo, aqiLevel]``, up to DAYS_PER_CITY rows per city.
        :raises SystemExit: for an unknown city or when the global retry budget is spent.
        """
        result = []
        # Every decoded response (or a failure marker) is kept and printed at the end.
        result_sum = []
        total_retries = 0
        for city in city_name:
            if city not in self.city:
                print('City {} not find in Zhejiang province'.format(city))
                raise SystemExit()
            print('Now spide {} city'.format(city))
            url = self.urls[self.city.index(city)]
            res = requests.get(url=url, headers=self.headers)
            # The per-city counter lives outside the loop so the limit can actually be reached.
            city_retries = 0
            while res.status_code != 200 and city_retries < self.MAX_RETRIES_PER_CITY:
                if total_retries >= self.MAX_RETRIES_TOTAL:
                    print('Please check your Internet')
                    raise SystemExit()
                city_retries += 1
                total_retries += 1
                print('Error, the {} attemp in {} city, together {} attemp'.format(city_retries, city, total_retries))
                time.sleep(1)
                res = requests.get(url=url, headers=self.headers)
            if res.status_code != 200:
                print('City {} can\'t find, go to next city'.format(city))
                result_sum.append({'city': city, 'status': None})
                continue
            print('City {} success'.format(city))
            payload = json.loads(res.text)
            result_sum.append(payload)
            # Slicing tolerates a response that holds fewer than DAYS_PER_CITY days.
            for day in payload['data'][:self.DAYS_PER_CITY]:
                result.append([city, day['ymd'], day['bWendu'], day['yWendu'],
                               day['tianqi'], day['fengxiang'], day['fengli'],
                               day['aqi'], day['aqiInfo'], day['aqiLevel']])
            # Pause between cities before the next request.
            time.sleep(5.01)
        print(result_sum)
        return result

    def save_file(self, result):
        """Append ``result`` rows to the raw workbook, de-duplicate, sort and save it."""
        df = pd.read_excel(self.RAW_FILE, sheet_name='Weather', engine='openpyxl')
        for row in result:
            df.loc[len(df.index)] = row
        df = self.delete_same(df)
        df.sort_values(['City', 'Date'], ascending=True, inplace=True)
        df.to_excel(self.RAW_FILE, index=False, sheet_name='Weather')

    def sort_values(self):
        """Sort the raw workbook by City and Date (can be called on its own)."""
        df = pd.read_excel(self.RAW_FILE, sheet_name='Weather', engine='openpyxl')
        df.sort_values(['City', 'Date'], ascending=True, inplace=True)
        df.to_excel(self.RAW_FILE, index=False, sheet_name='Weather')

    def delete_same(self, df):
        """Remove duplicated rows from ``df`` in place and return it.

        Rows are duplicates when 'City', 'Date' and 'Highest' are all equal;
        the first occurrence is kept. The previous pairwise loop kept indexing
        labels it had already dropped (KeyError) and also removed rows whose
        'Highest' value differed.
        """
        rows_before = len(df.index)
        df.drop_duplicates(subset=['City', 'Date', 'Highest'], keep='first', inplace=True)
        removed = rows_before - len(df.index)
        print('共去除了{}调重复项目'.format(removed))
        return df

    def secondary_treatment(self):
        """Write a numeric copy of the raw workbook (can be called on its own).

        'Highest' / 'Lowest' lose their last character and are parsed as int,
        '风力' labels are replaced by the matching ``fengli_change`` entry, and
        '天气情况' becomes the mean index of the ``tianqi_situation`` keywords
        found in the description (None when none is found). Every row is
        processed; the row count is no longer fixed at 7 * 11.

        :raises ValueError: if a wind label is not in the known label list.
        """
        self.if_file_2()
        fengli = ['微'] + ['%d级' % (i + 1) for i in range(17)]
        # One numeric value per wind label above (presumably a wind speed — confirm units).
        fengli_change = [0.1, 0.85, 2.4, 4.35, 6.65, 9.3, 12.25, 15.45, 18.9, 22.55, 26.4, 30.5, 34.75, 39.15, 43.75,
                         48.5, 53.45, None]
        tianqi_situation = ['晴', '多云', '阴', '雾', '小雨', '中雨', '大雨', '暴雨']
        df = pd.read_excel(self.RAW_FILE, sheet_name='Weather', engine='openpyxl')

        highest, lowest, wind_values, weather_codes = [], [], [], []
        for high, low, wind_label, description in zip(df['Highest'], df['Lowest'], df['风力'], df['天气情况']):
            highest.append(int(high[:-1]))
            lowest.append(int(low[:-1]))
            wind_values.append(fengli_change[fengli.index(wind_label)])
            matched = [code for code, keyword in enumerate(tianqi_situation) if keyword in description]
            # A description without any known keyword used to raise ZeroDivisionError.
            weather_codes.append(sum(matched) / len(matched) if matched else None)

        df['风力'] = wind_values
        df['天气情况'] = weather_codes
        df['Highest'] = highest
        df['Lowest'] = lowest
        df.to_excel(self.TREATED_FILE, index=False, sheet_name='Weather_secondary_treatment')

    def if_file_2(self):
        """Create the processed workbook (header row only) if it does not exist yet."""
        if not self._can_open(self.TREATED_FILE):
            self._create_workbook(self.TREATED_FILE, 'Weather_secondary_treatment')

# Run the scrape-and-process pipeline only when this file is executed as a script.
if __name__ == "__main__":
    Spider_april()

import pandas as pd
import matplotlib.pyplot as plt


class Plot:
    """Draw daily highest / lowest temperature charts from the scraped workbook.

    Constructing an instance reads sheet 'Weather' of 'Weather.xlsx', saves one
    two-panel chart per city, then one combined chart holding every city.
    """

    def __init__(self):
        """Load the data, configure matplotlib and save all charts."""
        # Workbook produced by the spider.
        self.df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')
        # Figure set-up: a font able to render Chinese text and a proper minus sign.
        plt.rcParams['font.family'] = ['Fangsong']
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure(figsize=(6, 3), dpi=200)
        plt.subplots_adjust(hspace=0.5)
        # City names, sorted.
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        # One line colour per city, matched by position in self.city.
        self.color = ['black', 'gray', 'lightcoral', 'red', 'darkgoldenrod', 'yellow', 'green', 'blue', 'cyan',
                      'purple', 'pink']
        for city in self.city:
            self.plot_Weather_Lowest(city)
            self.plot_Weather_Highest(city)
            plt.title('{}最高最低气温'.format(city))
            # Save the per-city chart, then clear the figure for the next city.
            plt.savefig('./{}3月份每天最高气温和最低气温.jpg'.format(city))
            plt.clf()
        for city in self.city:
            self.plot_Weather_Lowest(city)
            self.plot_Weather_Highest(city)
        plt.title('浙江各城市最高最低气温')
        plt.legend(labels=self.city, bbox_to_anchor=(1, 1), fontsize=6.5, framealpha=0)
        plt.savefig('./浙江各个城市3月份每天最高气温和最低气温.jpg')

    def _plot_series(self, city_name, column, subplot_index, marker):
        """Plot one temperature column of one city on the given row of a 2x1 grid.

        Rows are selected by the 'City' column rather than by a fixed 31-row
        position, so the chart stays correct when the sheet is not laid out as
        exactly 31 sorted rows per city.
        """
        axes = plt.subplot(2, 1, subplot_index)
        raw = self.df.loc[self.df['City'] == city_name, column]
        # Each cell is a number followed by one trailing character, which is dropped.
        y = [int(value[:-1]) for value in raw]
        # x is the 1-based day number.
        x = list(range(1, len(y) + 1))
        axes.plot(x, y, marker, color=self.color[self.city.index(city_name)], linewidth=0.5, markersize=0.6)
        plt.xticks(x, fontsize=5)
        plt.xlabel(column, fontsize=7)
        plt.yticks(fontsize=5)

    def plot_Weather_Highest(self, city_name):
        """Plot the 'Highest' temperatures of ``city_name`` on the upper panel."""
        self._plot_series(city_name, 'Highest', 1, '^-')

    def plot_Weather_Lowest(self, city_name):
        """Plot the 'Lowest' temperatures of ``city_name`` on the lower panel."""
        self._plot_series(city_name, 'Lowest', 2, 'v-')


# Generate every chart only when this file is executed as a script.
if __name__ == "__main__":
    Plot()

import pandas as pd

import matplotlib.pyplot as plt

class Plot:
    """Draw daily highest / lowest temperature charts from the scraped workbook.

    Constructing an instance reads sheet 'Weather' of 'Weather.xlsx', saves one
    two-panel chart per city, then one combined chart holding every city.
    """

    def __init__(self):
        """Load the data, configure matplotlib and save all charts."""
        # Workbook produced by the spider.
        self.df = pd.read_excel('Weather.xlsx', sheet_name='Weather', engine='openpyxl')
        # Figure set-up: a font able to render Chinese text and a proper minus sign.
        plt.rcParams['font.family'] = ['Fangsong']
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure(figsize=(6, 3), dpi=200)
        plt.subplots_adjust(hspace=0.5)
        # City names, sorted.
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        # One line colour per city, matched by position in self.city.
        self.color = ['black', 'gray', 'lightcoral', 'red', 'darkgoldenrod', 'yellow', 'green', 'blue', 'cyan',
                      'purple', 'pink']
        for city in self.city:
            self.plot_Weather_Lowest(city)
            self.plot_Weather_Highest(city)
            plt.title('{}最高最低气温'.format(city))
            # Save the per-city chart, then clear the figure for the next city.
            plt.savefig('./{}3月份每天最高气温和最低气温.jpg'.format(city))
            plt.clf()
        for city in self.city:
            self.plot_Weather_Lowest(city)
            self.plot_Weather_Highest(city)
        plt.title('浙江各城市最高最低气温')
        plt.legend(labels=self.city, bbox_to_anchor=(1, 1), fontsize=6.5, framealpha=0)
        plt.savefig('./浙江各个城市3月份每天最高气温和最低气温.jpg')

    def _plot_series(self, city_name, column, subplot_index, marker):
        """Plot one temperature column of one city on the given row of a 2x1 grid.

        Rows are selected by the 'City' column rather than by a fixed 31-row
        position, so the chart stays correct when the sheet is not laid out as
        exactly 31 sorted rows per city.
        """
        axes = plt.subplot(2, 1, subplot_index)
        raw = self.df.loc[self.df['City'] == city_name, column]
        # Each cell is a number followed by one trailing character, which is dropped.
        y = [int(value[:-1]) for value in raw]
        # x is the 1-based day number.
        x = list(range(1, len(y) + 1))
        axes.plot(x, y, marker, color=self.color[self.city.index(city_name)], linewidth=0.5, markersize=0.6)
        plt.xticks(x, fontsize=5)
        plt.xlabel(column, fontsize=7)
        plt.yticks(fontsize=5)

    def plot_Weather_Highest(self, city_name):
        """Plot the 'Highest' temperatures of ``city_name`` on the upper panel."""
        self._plot_series(city_name, 'Highest', 1, '^-')

    def plot_Weather_Lowest(self, city_name):
        """Plot the 'Lowest' temperatures of ``city_name`` on the lower panel."""
        self._plot_series(city_name, 'Lowest', 2, 'v-')

# Generate every chart only when this file is executed as a script.
if __name__ == "__main__":
    Plot()

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from numpy.matlib import repmat
from openpyxl import Workbook


class Linear():
    """Fit a partial-least-squares regression per city and export the results.

    For every city the three predictors '天气情况', '风力' and '空气质量' are
    regressed on the two responses 'Highest' and 'Lowest'. Constructing an
    instance fits all cities, writes the equations to 'result.txt', appends
    the coefficients to 'Result.xlsx' and saves one figure per city.
    """

    def __init__(self):
        """Load the processed workbook and run the fit / save / plot loop."""
        self.df = pd.read_excel('Weather_secondary_treatment.xlsx', sheet_name='Weather_secondary_treatment',
                                engine='openpyxl')
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei renders the Chinese labels
        plt.rcParams['axes.unicode_minus'] = False
        self.f = open('result.txt', mode='w')
        try:
            self.if_file()
            self.df_r = pd.read_excel('Result.xlsx', sheet_name='result', engine='openpyxl')
            for city_name in self.city:
                plt.clf()
                x0, y0, num, xishu, ch0, xish, sol = self.find(city_name=city_name)
                self.save(sol=sol, city_name=city_name)
                self.PLOT(ch0=ch0, num=num, x0=x0, y0=y0, xishu=xishu, xish=xish)
                plt.savefig('{}.jpg'.format(city_name))
            self.df_r.to_excel('Result.xlsx', index=False, sheet_name='result')
        finally:
            # The report file used to stay open for the life of the process.
            self.f.close()

    def find(self, city_name):
        """Fit the PLS model for one city.

        The city's rows are taken positionally: 31 consecutive rows starting
        at 31 * (index of the city in the sorted city list).

        :return: tuple ``(x0, y0, num, xishu, ch0, xish, sol)`` where x0 / y0
                 are the raw predictor / response matrices, num the number of
                 rows, xishu the coefficients on standardized data, ch0 the
                 intercepts and xish the coefficients on the original scale,
                 and sol the intercept row stacked on top of xish.
        """
        num = 31  # rows per city
        n = 3     # number of predictors
        m = 2     # number of responses
        start = self.city.index(city_name) * num
        stop = start + num
        columns = ['天气情况', '风力', '空气质量', 'Highest', 'Lowest']
        city_matrix = np.array([list(self.df[column][start:stop]) for column in columns]).T
        # Column means and standard deviations, used to undo the standardization.
        mu = np.mean(city_matrix, axis=0)
        sig = np.std(city_matrix, axis=0)
        data = preprocessing.scale(city_matrix)
        x0 = city_matrix[:, :n]
        y0 = city_matrix[:, n:]
        e0 = data[:, :n]
        f0 = data[:, n:]

        chg = np.identity(n)
        w = np.zeros([n, n])
        w_star = np.zeros([n, n])
        t = np.zeros([num, n])
        ss = []
        Q_h2 = []
        press = [0] * n
        # Keep every component unless the check below stops earlier; without
        # this default ``r`` was unbound whenever the loop never hit ``break``.
        r = n - 1
        for h in range(n):
            matrix = e0.T @ f0 @ f0.T @ e0
            # Weight vector: eigenvector belonging to the largest eigenvalue.
            val, vec = np.linalg.eig(matrix)
            w[:, h] = vec[:, val.argsort()[-1]]
            w_star[:, h] = chg @ w[:, h]
            t[:, h] = e0 @ w[:, h]
            alpha = np.array([list(e0.T @ t[:, h] / (t[:, h].T @ t[:, h]))])
            chg = chg @ (np.identity(n) - w[:, h:h + 1] @ alpha)
            # Deflate the predictor residual by the extracted component.
            e0 = e0 - t[:, h:h + 1] @ alpha
            # Regress the responses on the components so far; the last row
            # (intercept) of the least-squares solution is discarded.
            beta = np.linalg.pinv(np.c_[t[:, :h + 1], np.ones(num)]) @ f0
            beta = np.delete(beta, -1, axis=0)
            residual = f0 - t[:, :h + 1] @ beta
            ss.append((residual ** 2).sum())

            # Leave-one-out: refit without row j and score the prediction of row j.
            loo_errors = []
            for j in range(num):
                t_train = np.delete(t[:, :h + 1], j, axis=0)
                f_train = np.delete(f0, j, axis=0)
                beta1 = np.linalg.pinv(np.c_[t_train, np.ones(num - 1)]) @ f_train
                beta1 = np.delete(beta1, -1, axis=0)
                loo_residual = f0[j:j + 1, :] - t[j:j + 1, :h + 1] @ beta1
                loo_errors.append((loo_residual ** 2).sum())
            press[h] = np.array(loo_errors).sum()

            if h > 0:
                Q_h2.append(1 - press[h] / ss[h - 1])
            else:
                Q_h2.append(1)
            # Stop extracting components once Q_h2 falls below 0.0975.
            if Q_h2[h] < 0.0975:
                r = h
                break

        beta_z = np.linalg.pinv(np.c_[t[:, :r + 1], np.ones(num)]) @ f0
        beta_z = np.delete(beta_z, -1, axis=0)
        xishu = w_star[:, :r + 1] @ beta_z
        mu_x, mu_y = mu[:n], mu[n:]
        sig_x, sig_y = sig[:n], sig[n:]
        # Map the standardized coefficients back to the original scale.
        ch0 = []
        xish = np.zeros([n, m])
        for k in range(m):
            # 1-D dot product gives a scalar, so float() is not applied to an array.
            ch0.append(float(mu_y[k] - (mu_x / sig_x * sig_y[k]) @ xishu[:, k]))
            xish[:, k] = xishu[:, k] / sig_x * sig_y[k]
        sol = np.r_[np.array([ch0]), xish]
        return x0, y0, num, xishu, ch0, xish, sol

    def PLOT(self, ch0, num, x0, y0, xishu, xish):
        """Draw predicted-vs-actual scatter plots and a bar chart of ``xishu``.

        ``num`` is kept for interface compatibility; broadcasting adds the
        intercepts to every row without an explicit repeat.
        """
        y_hat = np.asarray(ch0) + x0 @ xish
        ymax = np.maximum(y_hat.max(axis=0), y0.max(axis=0))
        ax1 = plt.subplot(221)
        ax2 = plt.subplot(222)
        ax3 = plt.subplot(223)
        for axes, column, marker in ((ax1, 0, '*'), (ax2, 1, 'o')):
            plt.sca(axes)
            # Reference line y = x up to the largest plotted value.
            diagonal = list(range(int(ymax[column])))
            plt.plot(diagonal, diagonal, '-')
            plt.plot(y_hat[:, column], y0[:, column], marker)
        plt.sca(ax3)
        heights = xishu.flatten(order='F')
        plt.bar(np.arange(heights.size), height=heights, width=0.5)

    def if_file(self):
        """Create 'Result.xlsx' (sheet 'result', header row only) if it cannot be opened."""
        try:
            with open('Result.xlsx'):
                pass
        except IOError:
            wb = Workbook()
            sheet = wb.active
            sheet.title = 'result'
            sheet.append(['City', 'x0', 'x1', 'x2', 'x3'])
            wb.save('Result.xlsx')

    def save(self, sol, city_name):
        """Write the two fitted equations to the report and append them to self.df_r.

        :param sol: 4x2 table; column 0 holds intercept and coefficients of y1,
                    column 1 those of y2.
        :param city_name: label written in front of the equations and rows.
        """
        def signed(value):
            # Negative numbers already carry their sign; others get an explicit '+'.
            return value if value < 0 else '+' + str(value)

        terms = [signed(sol[row][col]) for col in range(2) for row in range(4)]
        self.f.write('{}\n y1 = {} {}x1 {}x2 {}x3\ny2 = {} {}x1 {}x2 {}x3\n\n'.format(city_name, *terms))
        for col in range(2):
            self.df_r.loc[len(self.df_r.index)] = [city_name] + [sol[row][col] for row in range(4)]


# Fit and export every city's model only when this file is executed as a script.
if __name__ == "__main__":
    Linear()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

from sklearn import preprocessing

from numpy.matlib import repmat

from openpyxl import Workbook

class Linear():
    """Fit a partial-least-squares regression per city and export the results.

    For every city the three predictors '天气情况', '风力' and '空气质量' are
    regressed on the two responses 'Highest' and 'Lowest'. Constructing an
    instance fits all cities, writes the equations to 'result.txt', appends
    the coefficients to 'Result.xlsx' and saves one figure per city.
    """

    def __init__(self):
        """Load the processed workbook and run the fit / save / plot loop."""
        self.df = pd.read_excel('Weather_secondary_treatment.xlsx', sheet_name='Weather_secondary_treatment',
                                engine='openpyxl')
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei renders the Chinese labels
        plt.rcParams['axes.unicode_minus'] = False
        self.f = open('result.txt', mode='w')
        try:
            self.if_file()
            self.df_r = pd.read_excel('Result.xlsx', sheet_name='result', engine='openpyxl')
            for city_name in self.city:
                plt.clf()
                x0, y0, num, xishu, ch0, xish, sol = self.find(city_name=city_name)
                self.save(sol=sol, city_name=city_name)
                self.PLOT(ch0=ch0, num=num, x0=x0, y0=y0, xishu=xishu, xish=xish)
                plt.savefig('{}.jpg'.format(city_name))
            self.df_r.to_excel('Result.xlsx', index=False, sheet_name='result')
        finally:
            # The report file used to stay open for the life of the process.
            self.f.close()

    def find(self, city_name):
        """Fit the PLS model for one city.

        The city's rows are taken positionally: 31 consecutive rows starting
        at 31 * (index of the city in the sorted city list).

        :return: tuple ``(x0, y0, num, xishu, ch0, xish, sol)`` where x0 / y0
                 are the raw predictor / response matrices, num the number of
                 rows, xishu the coefficients on standardized data, ch0 the
                 intercepts and xish the coefficients on the original scale,
                 and sol the intercept row stacked on top of xish.
        """
        num = 31  # rows per city
        n = 3     # number of predictors
        m = 2     # number of responses
        start = self.city.index(city_name) * num
        stop = start + num
        columns = ['天气情况', '风力', '空气质量', 'Highest', 'Lowest']
        city_matrix = np.array([list(self.df[column][start:stop]) for column in columns]).T
        # Column means and standard deviations, used to undo the standardization.
        mu = np.mean(city_matrix, axis=0)
        sig = np.std(city_matrix, axis=0)
        data = preprocessing.scale(city_matrix)
        x0 = city_matrix[:, :n]
        y0 = city_matrix[:, n:]
        e0 = data[:, :n]
        f0 = data[:, n:]

        chg = np.identity(n)
        w = np.zeros([n, n])
        w_star = np.zeros([n, n])
        t = np.zeros([num, n])
        ss = []
        Q_h2 = []
        press = [0] * n
        # Keep every component unless the check below stops earlier; without
        # this default ``r`` was unbound whenever the loop never hit ``break``.
        r = n - 1
        for h in range(n):
            matrix = e0.T @ f0 @ f0.T @ e0
            # Weight vector: eigenvector belonging to the largest eigenvalue.
            val, vec = np.linalg.eig(matrix)
            w[:, h] = vec[:, val.argsort()[-1]]
            w_star[:, h] = chg @ w[:, h]
            t[:, h] = e0 @ w[:, h]
            alpha = np.array([list(e0.T @ t[:, h] / (t[:, h].T @ t[:, h]))])
            chg = chg @ (np.identity(n) - w[:, h:h + 1] @ alpha)
            # Deflate the predictor residual by the extracted component.
            e0 = e0 - t[:, h:h + 1] @ alpha
            # Regress the responses on the components so far; the last row
            # (intercept) of the least-squares solution is discarded.
            beta = np.linalg.pinv(np.c_[t[:, :h + 1], np.ones(num)]) @ f0
            beta = np.delete(beta, -1, axis=0)
            residual = f0 - t[:, :h + 1] @ beta
            ss.append((residual ** 2).sum())

            # Leave-one-out: refit without row j and score the prediction of row j.
            loo_errors = []
            for j in range(num):
                t_train = np.delete(t[:, :h + 1], j, axis=0)
                f_train = np.delete(f0, j, axis=0)
                beta1 = np.linalg.pinv(np.c_[t_train, np.ones(num - 1)]) @ f_train
                beta1 = np.delete(beta1, -1, axis=0)
                loo_residual = f0[j:j + 1, :] - t[j:j + 1, :h + 1] @ beta1
                loo_errors.append((loo_residual ** 2).sum())
            press[h] = np.array(loo_errors).sum()

            if h > 0:
                Q_h2.append(1 - press[h] / ss[h - 1])
            else:
                Q_h2.append(1)
            # Stop extracting components once Q_h2 falls below 0.0975.
            if Q_h2[h] < 0.0975:
                r = h
                break

        beta_z = np.linalg.pinv(np.c_[t[:, :r + 1], np.ones(num)]) @ f0
        beta_z = np.delete(beta_z, -1, axis=0)
        xishu = w_star[:, :r + 1] @ beta_z
        mu_x, mu_y = mu[:n], mu[n:]
        sig_x, sig_y = sig[:n], sig[n:]
        # Map the standardized coefficients back to the original scale.
        ch0 = []
        xish = np.zeros([n, m])
        for k in range(m):
            # 1-D dot product gives a scalar, so float() is not applied to an array.
            ch0.append(float(mu_y[k] - (mu_x / sig_x * sig_y[k]) @ xishu[:, k]))
            xish[:, k] = xishu[:, k] / sig_x * sig_y[k]
        sol = np.r_[np.array([ch0]), xish]
        return x0, y0, num, xishu, ch0, xish, sol

    def PLOT(self, ch0, num, x0, y0, xishu, xish):
        """Draw predicted-vs-actual scatter plots and a bar chart of ``xishu``.

        ``num`` is kept for interface compatibility; broadcasting adds the
        intercepts to every row without an explicit repeat.
        """
        y_hat = np.asarray(ch0) + x0 @ xish
        ymax = np.maximum(y_hat.max(axis=0), y0.max(axis=0))
        ax1 = plt.subplot(221)
        ax2 = plt.subplot(222)
        ax3 = plt.subplot(223)
        for axes, column, marker in ((ax1, 0, '*'), (ax2, 1, 'o')):
            plt.sca(axes)
            # Reference line y = x up to the largest plotted value.
            diagonal = list(range(int(ymax[column])))
            plt.plot(diagonal, diagonal, '-')
            plt.plot(y_hat[:, column], y0[:, column], marker)
        plt.sca(ax3)
        heights = xishu.flatten(order='F')
        plt.bar(np.arange(heights.size), height=heights, width=0.5)

    def if_file(self):
        """Create 'Result.xlsx' (sheet 'result', header row only) if it cannot be opened."""
        try:
            with open('Result.xlsx'):
                pass
        except IOError:
            wb = Workbook()
            sheet = wb.active
            sheet.title = 'result'
            sheet.append(['City', 'x0', 'x1', 'x2', 'x3'])
            wb.save('Result.xlsx')

    def save(self, sol, city_name):
        """Write the two fitted equations to the report and append them to self.df_r.

        :param sol: 4x2 table; column 0 holds intercept and coefficients of y1,
                    column 1 those of y2.
        :param city_name: label written in front of the equations and rows.
        """
        def signed(value):
            # Negative numbers already carry their sign; others get an explicit '+'.
            return value if value < 0 else '+' + str(value)

        terms = [signed(sol[row][col]) for col in range(2) for row in range(4)]
        self.f.write('{}\n y1 = {} {}x1 {}x2 {}x3\ny2 = {} {}x1 {}x2 {}x3\n\n'.format(city_name, *terms))
        for col in range(2):
            self.df_r.loc[len(self.df_r.index)] = [city_name] + [sol[row][col] for row in range(4)]

# Fit and export every city's model only when this file is executed as a script.
if __name__ == "__main__":
    Linear()

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from openpyxl import Workbook


class Predict(object):
    """Predict the daily highest/lowest temperature for every city.

    Instantiating the class runs the whole step: it reads the April 1-7
    weather sheet and the coefficient table ``Result.xlsx``, evaluates
    the linear model for each weather row, and appends the results to
    ``Result_April.xlsx``.

    ``Result.xlsx`` is assumed to hold two consecutive rows per city, in
    the order of the sorted city list: row ``2k`` is used for the highest
    temperature and row ``2k + 1`` for the lowest.
    """

    def __init__(self):
        # Forward slashes work on Windows as well; the previous
        # backslash-separated path could only be resolved on Windows.
        self.df = pd.read_excel('./4月1-7日/Weather_secondary_treatment.xlsx',
                                sheet_name='Weather_secondary_treatment',
                                engine='openpyxl')
        self.df_r = pd.read_excel('Result.xlsx', sheet_name='result', engine='openpyxl')
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        self.if_file()
        self.df_r_a = pd.read_excel('Result_April.xlsx', sheet_name='result', engine='openpyxl')
        for i in range(len(self.df.index)):
            # Two coefficient rows per city: high first, low second.
            j = self.city.index(self.df['City'][i]) * 2
            self.predict_high = self._predict_temp(j, i)
            self.predict_low = self._predict_temp(j + 1, i)
            row = [self.df['City'][i], self.df['Date'][i], self.df['天气情况'][i],
                   self.df['风力'][i], self.df['空气质量'][i],
                   self.df['Highest'][i], self.df['Lowest'][i],
                   self.predict_high, self.predict_low]
            # Rows are appended after whatever the workbook already contains.
            self.df_r_a.loc[len(self.df_r_a.index)] = row
        self.df_r_a.to_excel('Result_April.xlsx', index=False, sheet_name='result')

    def _predict_temp(self, coef_row, i):
        """Evaluate ``x0 + x1*weather + x2*wind + x3*air`` for weather row ``i``.

        :param coef_row: row label in ``self.df_r`` holding the coefficients.
        :param i: row label in ``self.df`` holding the predictor values.
        """
        return (self.df_r['x0'][coef_row]
                + self.df_r['x1'][coef_row] * self.df['天气情况'][i]
                + self.df_r['x2'][coef_row] * self.df['风力'][i]
                + self.df_r['x3'][coef_row] * self.df['空气质量'][i])

    def if_file(self):
        """Create ``Result_April.xlsx`` with a header row if it cannot be opened."""
        try:
            # Opening is only a probe for existence/readability.
            with open('Result_April.xlsx'):
                pass
        except IOError:
            wb = Workbook()
            sheet = wb.active
            sheet.title = 'result'
            row = ['City', '时间', '天气情况', '风力', '空气质量', '实际最高温', '实际最低温', '预测最高温', '预测最低温']
            sheet.append(row)
            wb.save('Result_April.xlsx')


# Script entry point: run the prediction step only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    Predict()

import matplotlib.pyplot as plt

import pandas as pd

import numpy as np

from openpyxl import Workbook

class Predict(object):
    """Predict the daily highest/lowest temperature for every city.

    Instantiating the class runs the whole step: it reads the April 1-7
    weather sheet and the coefficient table ``Result.xlsx``, evaluates
    the linear model for each weather row, and appends the results to
    ``Result_April.xlsx``.

    ``Result.xlsx`` is assumed to hold two consecutive rows per city, in
    the order of the sorted city list: row ``2k`` is used for the highest
    temperature and row ``2k + 1`` for the lowest.
    """

    def __init__(self):
        # Forward slashes work on Windows as well; the previous
        # backslash-separated path could only be resolved on Windows.
        self.df = pd.read_excel('./4月1-7日/Weather_secondary_treatment.xlsx',
                                sheet_name='Weather_secondary_treatment',
                                engine='openpyxl')
        self.df_r = pd.read_excel('Result.xlsx', sheet_name='result', engine='openpyxl')
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        self.if_file()
        self.df_r_a = pd.read_excel('Result_April.xlsx', sheet_name='result', engine='openpyxl')
        for i in range(len(self.df.index)):
            # Two coefficient rows per city: high first, low second.
            j = self.city.index(self.df['City'][i]) * 2
            self.predict_high = self._predict_temp(j, i)
            self.predict_low = self._predict_temp(j + 1, i)
            row = [self.df['City'][i], self.df['Date'][i], self.df['天气情况'][i],
                   self.df['风力'][i], self.df['空气质量'][i],
                   self.df['Highest'][i], self.df['Lowest'][i],
                   self.predict_high, self.predict_low]
            # Rows are appended after whatever the workbook already contains.
            self.df_r_a.loc[len(self.df_r_a.index)] = row
        self.df_r_a.to_excel('Result_April.xlsx', index=False, sheet_name='result')

    def _predict_temp(self, coef_row, i):
        """Evaluate ``x0 + x1*weather + x2*wind + x3*air`` for weather row ``i``.

        :param coef_row: row label in ``self.df_r`` holding the coefficients.
        :param i: row label in ``self.df`` holding the predictor values.
        """
        return (self.df_r['x0'][coef_row]
                + self.df_r['x1'][coef_row] * self.df['天气情况'][i]
                + self.df_r['x2'][coef_row] * self.df['风力'][i]
                + self.df_r['x3'][coef_row] * self.df['空气质量'][i])

    def if_file(self):
        """Create ``Result_April.xlsx`` with a header row if it cannot be opened."""
        try:
            # Opening is only a probe for existence/readability.
            with open('Result_April.xlsx'):
                pass
        except IOError:
            wb = Workbook()
            sheet = wb.active
            sheet.title = 'result'
            row = ['City', '时间', '天气情况', '风力', '空气质量', '实际最高温', '实际最低温', '预测最高温', '预测最低温']
            sheet.append(row)
            wb.save('Result_April.xlsx')

# Script entry point: run the prediction step only when executed directly.
if __name__ == '__main__':
    Predict()

import pandas as pd
import numpy as np


class Find_average():
    """Average predicted and actual temperatures over all cities, per day.

    Instantiating the class reads ``Result_April.xlsx`` (sheet ``result``),
    computes four per-day means, prints them, and writes them to
    ``result_april.txt``.

    The sheet is read positionally: row ``city * 7 + day`` is taken to be
    day ``day`` of city ``city`` (11 cities, 7 days each).
    """

    N_CITIES = 11  # number of cities averaged over
    N_DAYS = 7     # number of days per city

    def __init__(self):
        self.df = pd.read_excel('Result_April.xlsx', sheet_name='result', engine='openpyxl')
        self.predict_high = self._daily_mean('预测最高温')
        self.predict_low = self._daily_mean('预测最低温')
        self.true_high = self._daily_mean('实际最高温')
        self.true_low = self._daily_mean('实际最低温')
        print(self.predict_high)
        print(self.predict_low)
        print(self.true_high)
        print(self.true_low)

        sections = (
            ('Predict_high', self.predict_high),
            ('Predict_low', self.predict_low),
            ('True_high', self.true_high),
            ('True_low', self.true_low),
        )
        # Opened only after all means are computed, so a failed run no longer
        # leaves a truncated report; ``with`` guarantees the file is closed.
        with open('result_april.txt', 'w') as f:
            for title, values in sections:
                f.write('{}\n'.format(title))
                for i in range(self.N_DAYS):
                    f.write('4月{}日 = {}\n'.format(i + 1, values[i]))

    def _daily_mean(self, column):
        """Return the per-day mean of ``column`` across all cities.

        :param column: name of a numeric column of ``self.df``.
        :return: array of ``N_DAYS`` means, one per day.
        """
        col = self.df[column]
        table = np.array([[col[city * self.N_DAYS + day] for city in range(self.N_CITIES)]
                          for day in range(self.N_DAYS)])
        return table.sum(axis=1) / self.N_CITIES


# Script entry point: compute the averages only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    Find_average()

import pandas as pd

import numpy as np

class Find_average():
    """Average predicted and actual temperatures over all cities, per day.

    Instantiating the class reads ``Result_April.xlsx`` (sheet ``result``),
    computes four per-day means, prints them, and writes them to
    ``result_april.txt``.

    The sheet is read positionally: row ``city * 7 + day`` is taken to be
    day ``day`` of city ``city`` (11 cities, 7 days each).
    """

    N_CITIES = 11  # number of cities averaged over
    N_DAYS = 7     # number of days per city

    def __init__(self):
        self.df = pd.read_excel('Result_April.xlsx', sheet_name='result', engine='openpyxl')
        self.predict_high = self._daily_mean('预测最高温')
        self.predict_low = self._daily_mean('预测最低温')
        self.true_high = self._daily_mean('实际最高温')
        self.true_low = self._daily_mean('实际最低温')
        print(self.predict_high)
        print(self.predict_low)
        print(self.true_high)
        print(self.true_low)

        sections = (
            ('Predict_high', self.predict_high),
            ('Predict_low', self.predict_low),
            ('True_high', self.true_high),
            ('True_low', self.true_low),
        )
        # Opened only after all means are computed, so a failed run no longer
        # leaves a truncated report; ``with`` guarantees the file is closed.
        with open('result_april.txt', 'w') as f:
            for title, values in sections:
                f.write('{}\n'.format(title))
                for i in range(self.N_DAYS):
                    f.write('4月{}日 = {}\n'.format(i + 1, values[i]))

    def _daily_mean(self, column):
        """Return the per-day mean of ``column`` across all cities.

        :param column: name of a numeric column of ``self.df``.
        :return: array of ``N_DAYS`` means, one per day.
        """
        col = self.df[column]
        table = np.array([[col[city * self.N_DAYS + day] for city in range(self.N_CITIES)]
                          for day in range(self.N_DAYS)])
        return table.sum(axis=1) / self.N_CITIES

# Script entry point: compute the averages only when executed directly.
if __name__ == '__main__':
    Find_average()

import pandas as pd
import matplotlib.pyplot as plt


class Plot_april():
    """Plot actual vs. predicted daily temperatures, one image per city.

    Instantiating the class reads ``Result_April.xlsx`` (sheet ``result``)
    and, for every city, draws the highest temperatures in the upper
    panel and the lowest in the lower panel (solid line = actual,
    dashed line = predicted), then saves the figure as a JPG.

    The sheet is read positionally: rows ``k * 7 .. k * 7 + 6`` belong to
    the k-th city of the sorted city list.
    """

    def __init__(self):
        # Load the prediction table. The name now matches the
        # 'Result_April.xlsx' written by Predict; the former spelling
        # 'Result_april.xlsx' is not found on case-sensitive file systems.
        self.df = pd.read_excel('Result_April.xlsx', sheet_name='result', engine='openpyxl')
        # Figure set-up: font for the Chinese title, readable minus signs.
        plt.rcParams['font.family'] = ['Fangsong']
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure(figsize=(6, 3), dpi=200)
        plt.subplots_adjust(hspace=0.5)
        # City names, sorted so that positions line up with the sheet rows.
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        # One line colour per city.
        self.color = ['black', 'gray', 'lightcoral', 'red', 'darkgoldenrod', 'yellow', 'green', 'blue', 'cyan', 'purple', 'pink']
        for i in self.city:
            self.plot_Weather_Lowest(i)
            self.plot_Weather_Highest(i)
            # The upper panel is current here, so the title lands on it.
            plt.title('{}最高最低气温'.format(i))
            # Save the image, then clear the figure for the next city.
            plt.savefig('./{}4月份每天最高气温和最低气温.jpg'.format(i))
            plt.clf()

    def _city_values(self, column, city_name):
        """Return the seven values of ``column`` belonging to ``city_name``."""
        idx = self.city.index(city_name)
        return list(self.df[column][idx * 7:(idx + 1) * 7])

    def _plot_pair(self, position, city_name, actual_col, predicted_col, label):
        """Draw actual (solid) and predicted (dashed) curves in one panel.

        :param position: subplot index within the 2 x 1 grid (1 or 2).
        :param city_name: city whose rows are plotted.
        :param actual_col: column with the measured temperatures.
        :param predicted_col: column with the predicted temperatures.
        :param label: x-axis label of the panel.
        """
        # x axis: the 7 days; y axis: temperature.
        x = [i + 1 for i in range(7)]
        ax = plt.subplot(2, 1, position)
        plt.sca(ax)
        colour = self.color[self.city.index(city_name)]
        ax.plot(x, self._city_values(actual_col, city_name), '-', color=colour, linewidth=0.5, markersize=0.6)
        ax.plot(x, self._city_values(predicted_col, city_name), '--', color=colour, linewidth=0.5, markersize=0.6)
        plt.ylim(0, 50)
        plt.xticks(x, fontsize=5)
        plt.xlabel(label, fontsize=7)
        plt.yticks(fontsize=5)

    # Plot the highest temperatures (upper panel).
    def plot_Weather_Highest(self, city_name):
        self._plot_pair(1, city_name, '实际最高温', '预测最高温', 'Highest')

    # Plot the lowest temperatures (lower panel).
    def plot_Weather_Lowest(self, city_name):
        self._plot_pair(2, city_name, '实际最低温', '预测最低温', 'Lowest')


# Script entry point: draw the per-city figures only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    Plot_april()

import pandas as pd

import matplotlib.pyplot as plt

class Plot_april():
    """Plot actual vs. predicted daily temperatures, one image per city.

    Instantiating the class reads ``Result_April.xlsx`` (sheet ``result``)
    and, for every city, draws the highest temperatures in the upper
    panel and the lowest in the lower panel (solid line = actual,
    dashed line = predicted), then saves the figure as a JPG.

    The sheet is read positionally: rows ``k * 7 .. k * 7 + 6`` belong to
    the k-th city of the sorted city list.
    """

    def __init__(self):
        # Load the prediction table. The name now matches the
        # 'Result_April.xlsx' written by Predict; the former spelling
        # 'Result_april.xlsx' is not found on case-sensitive file systems.
        self.df = pd.read_excel('Result_April.xlsx', sheet_name='result', engine='openpyxl')
        # Figure set-up: font for the Chinese title, readable minus signs.
        plt.rcParams['font.family'] = ['Fangsong']
        plt.rcParams['axes.unicode_minus'] = False
        plt.figure(figsize=(6, 3), dpi=200)
        plt.subplots_adjust(hspace=0.5)
        # City names, sorted so that positions line up with the sheet rows.
        self.city = '杭州、宁波、温州、绍兴、湖州、嘉兴、金华、衢州、台州、丽水、舟山'.split('、')
        self.city.sort()
        # One line colour per city.
        self.color = ['black', 'gray', 'lightcoral', 'red', 'darkgoldenrod', 'yellow', 'green', 'blue', 'cyan', 'purple', 'pink']
        for i in self.city:
            self.plot_Weather_Lowest(i)
            self.plot_Weather_Highest(i)
            # The upper panel is current here, so the title lands on it.
            plt.title('{}最高最低气温'.format(i))
            # Save the image, then clear the figure for the next city.
            plt.savefig('./{}4月份每天最高气温和最低气温.jpg'.format(i))
            plt.clf()

    def _city_values(self, column, city_name):
        """Return the seven values of ``column`` belonging to ``city_name``."""
        idx = self.city.index(city_name)
        return list(self.df[column][idx * 7:(idx + 1) * 7])

    def _plot_pair(self, position, city_name, actual_col, predicted_col, label):
        """Draw actual (solid) and predicted (dashed) curves in one panel.

        :param position: subplot index within the 2 x 1 grid (1 or 2).
        :param city_name: city whose rows are plotted.
        :param actual_col: column with the measured temperatures.
        :param predicted_col: column with the predicted temperatures.
        :param label: x-axis label of the panel.
        """
        # x axis: the 7 days; y axis: temperature.
        x = [i + 1 for i in range(7)]
        ax = plt.subplot(2, 1, position)
        plt.sca(ax)
        colour = self.color[self.city.index(city_name)]
        ax.plot(x, self._city_values(actual_col, city_name), '-', color=colour, linewidth=0.5, markersize=0.6)
        ax.plot(x, self._city_values(predicted_col, city_name), '--', color=colour, linewidth=0.5, markersize=0.6)
        plt.ylim(0, 50)
        plt.xticks(x, fontsize=5)
        plt.xlabel(label, fontsize=7)
        plt.yticks(fontsize=5)

    # Plot the highest temperatures (upper panel).
    def plot_Weather_Highest(self, city_name):
        self._plot_pair(1, city_name, '实际最高温', '预测最高温', 'Highest')

    # Plot the lowest temperatures (lower panel).
    def plot_Weather_Lowest(self, city_name):
        self._plot_pair(2, city_name, '实际最低温', '预测最低温', 'Lowest')

# Script entry point: draw the per-city figures only when executed directly.
if __name__ == '__main__':
    Plot_april()

Python程序设计大赛

该专题意在帮助想参加python程序设计大赛的同学，并且会分享我在大一时取得python程序设计大赛一等奖的作品。本专题默认读者已经学会python最基础语法以及常用库：爬虫（requests等）、numpy、pandas、matplotlib、sklearn（该库可以只做了解，要用特定算法时再进行调用）。以下都是我个人见解，不喜勿喷。我会从下列五点进行讲解：

选题
题目分析及选择算法
程序框架思路
编程及规范
报告：需要说明算法准确性、优点、缺点等

1. 选题

题目一般会有4道题，内容涉及：爬虫，人工智能深度学习，算法（分类，预测等）。选择自己最有把握的！虽然给你的比赛时间可能很长，但不推荐试图尝试一些新的算法与不熟悉的库，赛后你可以去学习有关此类的知识，但比赛建议从已有知识出发。

2. 题目分析及选择算法

python语法简单，基本上不存在你写不出来或者bug改不出来的问题，关键问题更在于对题目的分析及算法选择。我会从我大一参加的python程序设计大赛题目进行举例说明。

题目

城市天气分析

题目分析：
1. 爬虫。该爬虫相对简单，只需要调用api，并不需要运用BeautifulSoup等进行查找，网址也不用进行变换，也不需要用scrapy这类高级的框架，5s一次下限爬取速度也限制了一些可能性，当然你想写也是可以的，有点大材小用了，并且更麻烦了，老师也不一定想看你这个框架的代码。
2. 队伍区分度。该题目爬虫和画图一定是所有选这道题的队伍能写出来的，但是区分度就在于如何进行预测。大多数队伍是直接拿每日最高、最低气温直接进行线性拟合，我想作为读者的你第一反应可能也是这样，这就没了区分度。你可以对题目进行加深，不要单单局限于题目，题目可能出的不是很好，但是你要想出更好的预测、解决办法。天气预测，要从多个变量共同得出，不能单单只是温度曲线的拟合，更多的是每个变量对其的影响，这时你就要找到影响天气的变量，并进行数据预处理。我们当时考虑的因素有：[‘天气情况’, ‘风力’, ‘空气质量’]，但是遗憾的是我们没有考虑一些海拔、临海距离等因素。
3. 数据预处理。找到影响因素后如何进行数据预处理是一个关键因素，数据预处理在数学建模中也是很关键的一环。很多人可能对如何把文字转换成数字有疑问。最简单的处理就是风力，可以参考国家制定的风力对应表，但天气情况（如 [‘晴’, ‘大风’, …]）就很难处理，这时就需要你自己设置数字，把这些天气编成 [1, 2, …] 这样的数字等级。很多人可能不是很理解：为什么可以这样转变？我随意定义数字，每次得出的结论不是会不一样吗？这就要牵扯到算法了。
4. 算法。算法是最核心的一环，不管是神经网络训练还是人工智能算法还是线性拟合等等，要找到一个适合题目的算法是不容易的，要是你队伍里正好有会数学建模的小伙伴，那恭喜你，你赚到了，要是没有也不要气馁，可以baidu呀，也可以参考书籍（可点链接）。我们最终采用了最小二乘法拟合，没参加数学建模的小伙伴可参考上述链接文件，你会有不一样的认知。上述随意取数字的问题也迎刃而解，因为算法会帮你求解各个变量之间的参数矩阵并得到最终结果。
5. 现在大部分算法都是有现成框架的，但是个人建议有能力还是可以自己写一遍。因为我有了matlab代码，我只需要把matlab代码写成python语言即可，这绝对是一个加分项。
6. 现在基本上所有数学建模问题最终都可以用神经网络进行解决，该题目也可以进行，有能力你也可以去尝试，但是对于该题目小容量数据来说不推荐用神经网络。如果你要用神经网络也请好好对题目进行分析，像我上述步骤一样进行关键变量的提取，以及一定要好好的进行数据预处理，在神经网络训练中数据预处理是绝对重要的一环。

3. 程序框架思路

写程序的先后时序性得心中有数，但我想大部分人此方面没啥问题。该问题就是最终你的main.py文件中的运行先后顺序，这也是代码规范中重要的一环。比如我的作品main.py文件中：

代码

import numpy as np
from openpyxl import Workbook
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from numpy.matlib import repmat
import requests
import time
import json
from Spider import Spider
from Spider_april import Spider_april
from Linear import Linear
from Find_average import Find_average
from Predict import Predict
from Plot import Plot
from Plot_april import Plot_april

# Pipeline: instantiating a class runs that step (Predict, Find_average and
# Plot_april do all their work in __init__; the other classes are defined in
# modules not shown here and presumably behave the same way).
# The order matters because later steps read files written by earlier ones.
Spider()  # presumably crawls the historical weather data -- confirm in Spider.py
Spider_april()  # presumably crawls the April data -- confirm in Spider_april.py
Plot()  # presumably plots the crawled data -- confirm in Plot.py
Linear()  # presumably fits the regression and fills Result.xlsx, which Predict reads
Predict()  # reads Result.xlsx and the April weather sheet, writes Result_April.xlsx
Find_average()  # reads Result_April.xlsx, writes result_april.txt
Plot_april()  # reads the prediction workbook, saves one JPG per city

import numpy as np

from openpyxl import Workbook

import matplotlib.pyplot as plt

import pandas as pd

from sklearn import preprocessing

from numpy.matlib import repmat

import requests

import time

import json

from Spider import Spider

from Spider_april import Spider_april

from Linear import Linear

from Find_average import Find_average

from Predict import Predict

from Plot import Plot

from Plot_april import Plot_april

# Pipeline: instantiating a class runs that step (Predict, Find_average and
# Plot_april do all their work in __init__; the other classes are defined in
# modules not shown here and presumably behave the same way).
# The order matters because later steps read files written by earlier ones.
Spider()  # presumably crawls the historical weather data -- confirm in Spider.py

Spider_april()  # presumably crawls the April data -- confirm in Spider_april.py

Plot()  # presumably plots the crawled data -- confirm in Plot.py

Linear()  # presumably fits the regression and fills Result.xlsx, which Predict reads

Predict()  # reads Result.xlsx and the April weather sheet, writes Result_April.xlsx

Find_average()  # reads Result_April.xlsx, writes result_april.txt

Plot_april()  # reads the prediction workbook, saves one JPG per city

4. 程序规范

像我上述main.py文件展示的一样，请把每个实现不同指标的文件分开写，最终在main.py中import进来，在每个单独小文件测试完后，最终在main.py中跑一遍，确保无误后代码部分就算写完了。
每个单独实现不同指标的小文件请遵守以下写法，这个只是最简单的应遵守的格式，你可以在此基础上进行拓展，如：迭代。
查重、注释、最后的自动系统格式等请注意。
如果用到神经网络最好还是用jupyter来写可以节省很多运行时间。个人直接用pycharm写的，因为写习惯了。算法等运行时间较长需要临时保存变量的尽量还是可以用jupyter来写

import numpy as np
import xxx


class Name:
    """Skeleton for one pipeline step: rename the class and fill in the methods.

    ``pass`` keeps the skeleton importable before any real code is added,
    since a method body consisting only of a comment is a syntax error.
    """

    def __init__(self, object=None):
        # ``object`` is a placeholder for whatever the step needs; it is
        # optional so that the bare ``Name()`` call in the script guard works.
        # your code
        pass

    def def_1(self, object_1):
        # your code
        pass


# Script entry point: run this step only when the file is executed directly,
# not when it is imported (e.g. from main.py).
if __name__ == '__main__':
    Name()

import numpy as np

import xxx

class Name:
    """Skeleton for one pipeline step: rename the class and fill in the methods.

    ``pass`` keeps the skeleton importable before any real code is added,
    since a method body consisting only of a comment is a syntax error.
    """

    def __init__(self, object=None):
        # ``object`` is a placeholder for whatever the step needs; it is
        # optional so that the bare ``Name()`` call in the script guard works.
        # your code
        pass

    def def_1(self, object_1):
        # your code
        pass

# Script entry point: run this step only when the file is executed directly.
if __name__ == '__main__':
    Name()

5. 论文报告

最后还需要像参加数学建模一样要验证模型可行性、算法优缺点、算法准确性等，要清晰的在报告中体现，有能力的话还可以再进行改进方案的叙述和实施